For this assignment, I'm going to compare covid and mobility data at the provincial level in the Netherlands. As a first step, I import the necessary libraries.
import pandas as pd
from pathlib import Path
import numpy as np
import math
import scipy
from scipy.signal import find_peaks
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import geopandas as gpd
First, I'm going to import and combine dataframes of the two types of data I found.
I'm starting off with the covid data.
# import the file with the covid data. I've chosen provincial data, so that I can eventually compare the number of cases, hospitalized patients and casualties against the mobility patterns per province
file_path_covid = r'C:\Users\Govert\OneDrive - Delft University of Technology\Studie\Master jaar 2\Kwartiel 1\TIL6010 - TIL programming\til6010-assignments\Final assignment\RIVM_NL_provincial.csv'
df_covid = pd.read_csv(file_path_covid)
# drop the rows without a province name and rename the columns to English, so that the province column matches the mobility dataframe
df_covid = df_covid[(df_covid['Provincienaam'].notna())]
df_covid.rename(columns = {'Datum': 'date',
'Provincienaam': 'province',
'Provinciecode': 'province_code',
'Type': 'type',
'Aantal': 'number',
'AantalCumulatief': 'number_cumulative'}, inplace=True)
# the covid dataframe has three rows per date, each holding either the number of cases, the casualties or the number of hospitalized patients. I eventually want only one row per day, so I spread these values into separate columns
df_covid['number of casualties'] = np.where(df_covid['type'] == 'Overleden', df_covid['number'], np.nan)
df_covid['number of hospitalized patients'] = np.where(df_covid['type'] == 'Ziekenhuisopname', df_covid['number'], np.nan)
df_covid['number of cases'] = np.where(df_covid['type'] == 'Totaal', df_covid['number'], np.nan)
df_covid['number of casualties_cum'] = np.where(df_covid['type'] == 'Overleden', df_covid['number_cumulative'], np.nan)
df_covid['number of hospitalized patients_cum'] = np.where(df_covid['type'] == 'Ziekenhuisopname', df_covid['number_cumulative'], np.nan)
df_covid['number of cases_cum'] = np.where(df_covid['type'] == 'Totaal', df_covid['number_cumulative'], np.nan)
# then I drop the old columns with the information in rows
df_covid = df_covid.drop(columns = ['province_code', 'type', 'number', 'number_cumulative'])
# I turn the dates to datetime
df_datetime_1 = pd.to_datetime(df_covid['date'],format='%d/%m/%Y')
df_covid['date'] = df_datetime_1
# Then I sort the rows by province and date to make the dataframe clearer
df_covid = df_covid.sort_values(by=['province', 'date'], ascending=[True, True])
df_covid.reset_index(drop=True, inplace=True)
df_covid.head(-10)
|   | date | province | number of casualties | number of hospitalized patients | number of cases | number of casualties_cum | number of hospitalized patients_cum | number of cases_cum |
---|---|---|---|---|---|---|---|---|
0 | 2020-02-27 | Drenthe | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2020-02-27 | Drenthe | NaN | NaN | NaN | NaN | NaN | 0.0 |
2 | 2020-02-27 | Drenthe | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2020-02-28 | Drenthe | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2020-02-28 | Drenthe | NaN | NaN | 0.0 | NaN | NaN | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
14925 | 2021-04-13 | Zuid-Limburg | 2.0 | NaN | NaN | 1533.0 | NaN | NaN |
14926 | 2021-04-13 | Zuid-Limburg | NaN | NaN | 562.0 | NaN | NaN | 90113.0 |
14927 | 2021-04-13 | Zuid-Limburg | NaN | 10.0 | NaN | NaN | 2405.0 | NaN |
14928 | 2021-04-14 | Zuid-Limburg | 4.0 | NaN | NaN | 1537.0 | NaN | NaN |
14929 | 2021-04-14 | Zuid-Limburg | NaN | NaN | 468.0 | NaN | NaN | 90581.0 |
14930 rows × 8 columns
Next, I want to get rid of the fact that there are now three rows per date per province. I want a single row per date per province that contains all the information.
When I simply resample over the dates, the information is forced into one row, but it ends up identical for all provinces. That is not what I want, so I try the following:
This way I can resample without losing data. The resampling also drops the province column because it contains strings, so I re-add it afterwards.
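As a side note, the same weekly aggregation can probably be written without an explicit loop by grouping on the province first. This is only a sketch of that idea (it assumes the df_covid layout above, may bin the weeks slightly differently at the edges, and is not used in the rest of the notebook):
# Sketch: weekly mean per province in one go, keeping the province column
df_covid_weekly = (
    df_covid
    .groupby(['province', pd.Grouper(key='date', freq='7D')])  # 7-day bins within each province
    .mean()
    .reset_index()  # bring 'province' and 'date' back as columns
)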
# I create an empty dataframe with the same columns as the covid dataframe
rows = []
columns = []
df_covid_final = pd.DataFrame(rows, columns = df_covid.columns.values.tolist())
df_covid_month_final = pd.DataFrame(rows, columns = df_covid.columns.values.tolist())
# I create a list of the unique provinces in the covid dataframe to loop through
provinces = df_covid['province'].unique()
# I resample each province separately, because otherwise all provinces end up with identical data
for province in provinces:
    df_province_1 = df_covid.loc[df_covid['province'] == province]
    df_province_1_agg = df_province_1.resample('7D', on='date').mean()
    df_province_1_agg.reset_index(inplace=True)
    df_province_1_agg['province'] = province
    df_covid_final = pd.concat((df_covid_final, df_province_1_agg))
    # The monthly dataframe comes in handy later in the assignment
    df_province_1_aggmonth = df_province_1.resample('M', on='date').sum()
    df_province_1_aggmonth.reset_index(inplace=True)
    df_province_1_aggmonth['province'] = province
    df_covid_month_final = pd.concat((df_covid_month_final, df_province_1_aggmonth))
df_covid_final.reset_index(inplace=True)
df_covid_month_final.reset_index(inplace=True)
df_covid_final.head(-10)
|   | index | date | province | number of casualties | number of hospitalized patients | number of cases | number of casualties_cum | number of hospitalized patients_cum | number of cases_cum |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2020-02-27 | Drenthe | NaN | NaN | 0.333333 | NaN | NaN | 0.857143 |
1 | 1 | 2020-03-05 | Drenthe | NaN | NaN | 1.428571 | NaN | NaN | 7.285714 |
2 | 2 | 2020-03-12 | Drenthe | 0.000000 | 0.200000 | 1.428571 | 0.000000 | 0.500000 | 15.857143 |
3 | 3 | 2020-03-19 | Drenthe | 0.285714 | 1.857143 | 5.428571 | 1.428571 | 6.285714 | 43.714286 |
4 | 4 | 2020-03-26 | Drenthe | 0.428571 | 4.142857 | 11.714286 | 3.285714 | 28.285714 | 105.714286 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
705 | 45 | 2021-01-07 | Zuid-Limburg | 8.285714 | 6.857143 | 607.142857 | 1146.142857 | 2013.857143 | 53463.142857 |
706 | 46 | 2021-01-14 | Zuid-Limburg | 8.000000 | 5.857143 | 518.000000 | 1205.142857 | 2063.000000 | 57374.714286 |
707 | 47 | 2021-01-21 | Zuid-Limburg | 6.285714 | 4.571429 | 428.714286 | 1254.571429 | 2098.714286 | 60704.142857 |
708 | 48 | 2021-01-28 | Zuid-Limburg | 5.428571 | 4.428571 | 365.428571 | 1294.714286 | 2127.714286 | 63372.714286 |
709 | 49 | 2021-02-04 | Zuid-Limburg | 5.142857 | 4.000000 | 293.857143 | 1328.571429 | 2159.571429 | 65731.000000 |
710 rows × 9 columns
Now, I'm importing the mobility data. The most noteworthy thing I do is changing the column names and reordering the columns.
# import the files for the mobility data
file_path_mobility_2020 = r'C:\Users\Govert\OneDrive - Delft University of Technology\Studie\Master jaar 2\Kwartiel 1\TIL6010 - TIL programming\til6010-assignments\Final assignment\2020_NL_Region_Mobility_report.csv'
file_path_mobility_2021 = r'C:\Users\Govert\OneDrive - Delft University of Technology\Studie\Master jaar 2\Kwartiel 1\TIL6010 - TIL programming\til6010-assignments\Final assignment\2021_NL_Region_Mobility_report.csv'
# open the files as dataframes
df_mob_2020 = pd.read_csv(file_path_mobility_2020)
df_mob_2021 = pd.read_csv(file_path_mobility_2021)
# combine the mobility dataframes, keep only the provincial rows, drop the unnecessary columns and rename most of the remaining ones
df_mob_combined = pd.concat((df_mob_2020, df_mob_2021))
df_mobility = df_mob_combined[(df_mob_combined['sub_region_1'].notna()) & (df_mob_combined['sub_region_2'].isna())]
df_mobility = df_mobility.drop(columns = ['country_region_code', 'country_region', 'sub_region_2', 'metro_area', 'iso_3166_2_code', 'census_fips_code', 'place_id'])
df_mobility.rename(columns = {'sub_region_1': 'province',
'retail_and_recreation_percent_change_from_baseline': 'retail and recreation trips',
'grocery_and_pharmacy_percent_change_from_baseline': 'grocery and pharmacy trips',
'parks_percent_change_from_baseline': 'park trips',
'transit_stations_percent_change_from_baseline': 'transit station trips',
'workplaces_percent_change_from_baseline': 'work trips',
'residential_percent_change_from_baseline': 'residential trips'},inplace=True)
# set the date and reformat the dataframes
df_datetime_2 = pd.to_datetime(df_mobility['date'],format='%d/%m/%Y')
df_mobility['date'] = df_datetime_2
df_mobility = df_mobility.sort_values(by=['province','date'],ascending=[True,True])
# Reorder the columns
cols = list(df_mobility.columns.values)
df_mobility = df_mobility[[cols[1]] + [cols[0]]+ cols[2:8]]
df_mobility.head(-10)
|   | date | province | retail and recreation trips | grocery and pharmacy trips | park trips | transit station trips | work trips | residential trips |
---|---|---|---|---|---|---|---|---|
321 | 2020-02-15 | Drenthe | -6.0 | 0.0 | 18.0 | 6.0 | 1.0 | 0.0 |
322 | 2020-02-16 | Drenthe | -10.0 | -10.0 | -38.0 | -12.0 | -2.0 | 1.0 |
323 | 2020-02-17 | Drenthe | 3.0 | -2.0 | 52.0 | -24.0 | -22.0 | 3.0 |
324 | 2020-02-18 | Drenthe | 8.0 | 0.0 | 45.0 | -21.0 | -22.0 | 3.0 |
325 | 2020-02-19 | Drenthe | 4.0 | -4.0 | 49.0 | -19.0 | -22.0 | 2.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
26570 | 2021-08-27 | Zuid-Limburg | 6.0 | 13.0 | 148.0 | -21.0 | -37.0 | 5.0 |
26571 | 2021-08-28 | Zuid-Limburg | -9.0 | 2.0 | 107.0 | -5.0 | 4.0 | 1.0 |
26572 | 2021-08-29 | Zuid-Limburg | -11.0 | 4.0 | 47.0 | -6.0 | 11.0 | 1.0 |
26573 | 2021-08-30 | Zuid-Limburg | 25.0 | 18.0 | 162.0 | -15.0 | -34.0 | 5.0 |
26574 | 2021-08-31 | Zuid-Limburg | 27.0 | 22.0 | 236.0 | -10.0 | -33.0 | 5.0 |
6878 rows × 8 columns
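Before the two sources are combined, it is worth checking that they use exactly the same province labels, since the merge later on is done on the province name. A quick sanity check (a sketch, not essential for the rest of the notebook):
# Sketch: provinces that appear in only one of the two dataframes (ideally both sets are empty)
covid_provinces = set(df_covid['province'].unique())
mobility_provinces = set(df_mobility['province'].unique())
print('Only in covid data:   ', sorted(covid_provinces - mobility_provinces))
print('Only in mobility data:', sorted(mobility_provinces - covid_provinces))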
Here I do something similar to what I did with the covid data, with one small difference:
# Again I create an empty dataframe, this time with the columns of the mobility dataframe
rows = []
columns = []
df_mobility_final = pd.DataFrame(rows, columns = df_mobility.columns.values.tolist())
df_mobility_month_final = pd.DataFrame(rows, columns = df_mobility.columns.values.tolist())
# Again a list of all unique provinces
provinces = df_mobility['province'].unique()
# Then I loop through the provinces again, but this time I also drop the rows at the beginning until the date the covid dataframe starts. This way the mobility data is resampled to the same days/weeks as the covid data
for province in provinces:
    df_province_2 = df_mobility.loc[df_mobility['province'] == province]
    df_province_2 = df_province_2.drop(df_province_2.index[0:12])
    df_province_2_agg = df_province_2.resample('7D', on='date').mean()
    df_province_2_agg.reset_index(inplace=True)
    df_province_2_agg['province'] = province
    df_mobility_final = pd.concat((df_mobility_final, df_province_2_agg))
    # The monthly dataframe comes in handy later in the assignment
    df_province_2_aggmonth = df_province_2.resample('M', on='date').sum()
    df_province_2_aggmonth.reset_index(inplace=True)
    df_province_2_aggmonth['province'] = province
    df_mobility_month_final = pd.concat((df_mobility_month_final, df_province_2_aggmonth))
df_mobility_final.reset_index(inplace=True)
df_mobility_month_final.reset_index(inplace=True)
df_mobility_final.head(-10)
|   | index | date | province | retail and recreation trips | grocery and pharmacy trips | park trips | transit station trips | work trips | residential trips |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2020-02-27 | Drenthe | 3.142857 | 2.714286 | 14.000000 | 1.285714 | 3.142857 | 0.428571 |
1 | 1 | 2020-03-05 | Drenthe | 2.142857 | 1.857143 | 10.857143 | -0.857143 | 2.571429 | 0.285714 |
2 | 2 | 2020-03-12 | Drenthe | -15.714286 | 11.571429 | 17.428571 | -31.714286 | -17.285714 | 7.428571 |
3 | 3 | 2020-03-19 | Drenthe | -39.000000 | -9.714286 | 14.285714 | -54.571429 | -37.428571 | 13.571429 |
4 | 4 | 2020-03-26 | Drenthe | -43.714286 | -18.714286 | 5.285714 | -59.285714 | -40.571429 | 14.571429 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
957 | 66 | 2021-06-03 | Zuid-Limburg | 2.285714 | 17.285714 | 177.428571 | -15.714286 | -8.285714 | 4.428571 |
958 | 67 | 2021-06-10 | Zuid-Limburg | 4.714286 | 18.285714 | 203.714286 | -11.571429 | -7.142857 | 2.857143 |
959 | 68 | 2021-06-17 | Zuid-Limburg | -0.857143 | 14.428571 | 160.285714 | -17.428571 | -7.857143 | 3.714286 |
960 | 69 | 2021-06-24 | Zuid-Limburg | 5.285714 | 14.285714 | 168.142857 | -13.571429 | -8.571429 | 2.857143 |
961 | 70 | 2021-07-01 | Zuid-Limburg | 8.571429 | 14.428571 | 166.285714 | -12.000000 | -8.571429 | 3.428571 |
962 rows × 9 columns
Merging the dataframes then becomes relatively easy:
# then I merge the covid and the mobility dataframes and drop the columns with the old indices
df_full = df_covid_final.merge(df_mobility_final, how='inner', left_on=['date','province'], right_on=['date','province'])
df_full = df_full.drop(columns=['index_x','index_y'])
# The monthly dataframe is also merged the same way
df_full_month = df_covid_month_final.merge(df_mobility_month_final,how='inner', left_on=['date','province'], right_on=['date','province'])
df_full_month = df_full_month.drop(columns=['index_x','index_y'])
df_full.head(-10)
|   | date | province | number of casualties | number of hospitalized patients | number of cases | number of casualties_cum | number of hospitalized patients_cum | number of cases_cum | retail and recreation trips | grocery and pharmacy trips | park trips | transit station trips | work trips | residential trips |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-02-27 | Drenthe | NaN | NaN | 0.333333 | NaN | NaN | 0.857143 | 3.142857 | 2.714286 | 14.000000 | 1.285714 | 3.142857 | 0.428571 |
1 | 2020-03-05 | Drenthe | NaN | NaN | 1.428571 | NaN | NaN | 7.285714 | 2.142857 | 1.857143 | 10.857143 | -0.857143 | 2.571429 | 0.285714 |
2 | 2020-03-12 | Drenthe | 0.000000 | 0.200000 | 1.428571 | 0.000000 | 0.500000 | 15.857143 | -15.714286 | 11.571429 | 17.428571 | -31.714286 | -17.285714 | 7.428571 |
3 | 2020-03-19 | Drenthe | 0.285714 | 1.857143 | 5.428571 | 1.428571 | 6.285714 | 43.714286 | -39.000000 | -9.714286 | 14.285714 | -54.571429 | -37.428571 | 13.571429 |
4 | 2020-03-26 | Drenthe | 0.428571 | 4.142857 | 11.714286 | 3.285714 | 28.285714 | 105.714286 | -43.714286 | -18.714286 | 5.285714 | -59.285714 | -40.571429 | 14.571429 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
705 | 2021-01-07 | Zuid-Limburg | 8.285714 | 6.857143 | 607.142857 | 1146.142857 | 2013.857143 | 53463.142857 | -60.714286 | -12.571429 | -6.000000 | -49.000000 | -25.285714 | 12.142857 |
706 | 2021-01-14 | Zuid-Limburg | 8.000000 | 5.857143 | 518.000000 | 1205.142857 | 2063.000000 | 57374.714286 | -61.857143 | -15.285714 | -9.571429 | -50.142857 | -27.000000 | 11.571429 |
707 | 2021-01-21 | Zuid-Limburg | 6.285714 | 4.571429 | 428.714286 | 1254.571429 | 2098.714286 | 60704.142857 | -60.714286 | -12.571429 | 3.714286 | -48.285714 | -26.571429 | 12.000000 |
708 | 2021-01-28 | Zuid-Limburg | 5.428571 | 4.428571 | 365.428571 | 1294.714286 | 2127.714286 | 63372.714286 | -60.714286 | -12.857143 | -10.714286 | -50.000000 | -25.571429 | 12.142857 |
709 | 2021-02-04 | Zuid-Limburg | 5.142857 | 4.000000 | 293.857143 | 1328.571429 | 2159.571429 | 65731.000000 | -61.571429 | -12.285714 | -2.857143 | -52.571429 | -28.571429 | 12.571429 |
710 rows × 14 columns
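Because an inner merge silently drops weeks that exist in only one of the two sources, a quick check that every province keeps a comparable number of weekly rows can be reassuring. A sketch of such a check:
# Sketch: number of merged weekly rows and the covered date range per province
print(df_full.groupby('province').size())
print('Total rows:', len(df_full), '- from', df_full['date'].min(), 'to', df_full['date'].max())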
In the second part, I'm going to find common peaks for two kinds of trips and for two types of covid data, all for the province of Noord-Brabant.
The peaks dictionaries are needed to build the lists of peaks to plot. I also keep date dictionaries with the dates of the peaks and valleys, because I eventually want to compare dates rather than values.
When I have the plots, I'm going to use code to look for common peaks and valleys as well. Just reading them from the graphs is too simple of course :)
Additional information is also given in the comments in the code.
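The matching that the code below does with three nested loops can also be captured in a small helper function that, for two lists of weekly dates, reports which dates coincide and which lie within a margin of k weeks before or after. This is only a sketch with hypothetical names (common_dates, dates_a, dates_b); the notebook itself keeps the explicit loops:
import datetime

def common_dates(dates_a, dates_b, max_weeks=1):
    # dates_a and dates_b are iterables of pandas Timestamps, e.g. the date dictionaries built below
    result = {'in same week': [], 'within margin after': [], 'within margin before': []}
    b_days = {d.date() for d in dates_b}
    for a in dates_a:
        a_day = a.date()
        if a_day in b_days:
            result['in same week'].append(str(a_day))
        for k in range(1, max_weeks + 1):
            if a_day + datetime.timedelta(weeks=k) in b_days:
                result['within margin after'].append(str(a_day))
            if a_day - datetime.timedelta(weeks=k) in b_days:
                result['within margin before'].append(str(a_day))
    return result
Unlike the nested loops below, this helper only records a same-week match once, even when the margin is larger than one week.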
# In this block, all the variables can be adjusted. The defaults can be seen below.
# The weeks variables are the margins for the common peak finding.
activity_1 = 'work trips'
activity_2 = 'grocery and pharmacy trips'
activities_1 = [activity_1, activity_2]
weeks_1 = 1
activity_3 = 'number of hospitalized patients'
activity_4 = 'number of casualties'
activities_2 = [activity_3, activity_4]
weeks_2 = 1
province_1 = 'Noord-Brabant'
weeks_3 = 1
# First, I'm going to use my find_peaks function from assignment 2 to return, for a specific activity, the indices of the peaks as a list
def my_find_peaks(data, activity, **kwargs):
    diff_1 = data[activity].diff(periods=-1)  # value minus the next value
    diff_2 = data[activity].diff(periods=1)   # value minus the previous value
    peaks = []
    for i in range(len(diff_1)):
        if diff_1[i] > 0 and diff_2[i] > 0:   # larger than both neighbours: a local peak
            peaks.append(int(i))
    return peaks
# And do the same for the valleys
def my_find_valleys(data, activity, **kwargs):
    diff_1 = data[activity].diff(periods=-1)
    diff_2 = data[activity].diff(periods=1)
    valleys = []
    for i in range(len(diff_1)):
        if diff_1[i] < 0 and diff_2[i] < 0:   # smaller than both neighbours: a local valley
            valleys.append(int(i))
    return valleys
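Since scipy.signal.find_peaks is already imported at the top of the notebook, a quick cross-check of the hand-rolled detector on a small synthetic series is cheap. This sketch only illustrates that the two approaches agree on simple data without plateaus:
# Sketch: compare my_find_peaks with scipy.signal.find_peaks on a toy series
toy = pd.DataFrame({'value': [0, 2, 1, 3, 0, 0, 5, 1]})
print('my_find_peaks:    ', my_find_peaks(toy, 'value'))        # expected: [1, 3, 6]
print('scipy find_peaks: ', list(find_peaks(toy['value'])[0]))  # expected: [1, 3, 6]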
# Then I create the figure and several dictionaries that are needed. The peaks and valleys dictionaries are for the graphs, the date dictionaries are for the next steps
fig_1 = go.Figure()
peaks_dict_1 = {}
valleys_dict_1 = {}
peaks_date_dict_1 = {}
valleys_date_dict_1 = {}
# I create a dataframe that contains only the data for the selected province and reset the indices for it
df_full_province_1 = (df_full.loc[df_full['province'] == province_1])
df_full_province_1.reset_index(inplace=True)
# I find the peaks and valleys and add them to the dictionaries
for activity in activities_1:
    max_ind = my_find_peaks(df_full_province_1, activity)
    peaks_dict_1[activity] = max_ind
    min_ind = my_find_valleys(df_full_province_1, activity)
    valleys_dict_1[activity] = min_ind
    # Then I turn them into dataframes to be able to use the dates for the graphs and for the date dictionaries
    df_max_1 = df_full_province_1.iloc[max_ind]
    df_min_1 = df_full_province_1.iloc[min_ind]
    # The date dictionaries are filled with the dates of the peaks and the valleys
    peaks_date_dict_1[activity] = df_max_1['date']
    valleys_date_dict_1[activity] = df_min_1['date']
    # The graphs are formatted
    x1 = df_full_province_1['date']
    y1 = df_full_province_1[activity]
    x2 = df_max_1['date']
    y2 = df_max_1[activity]
    x3 = df_min_1['date']
    y3 = df_min_1[activity]
    fig_1.add_trace(go.Scatter(x=x1, y=y1, name=activity))
    fig_1.add_trace(go.Scatter(x=x2, y=y2, mode='markers', name='peaks ' + activity))
    fig_1.add_trace(go.Scatter(x=x3, y=y3, mode='markers', name='valleys ' + activity))
fig_1.update_layout(title= activity_1 + ' and ' + activity_2 + ' during covid times in ' + province_1)
fig_1.show()
# Now I'm going to compare the two date dictionaries and see if there are any common peaks
# The margin can be adapted by changing the number for the weeks_1 variable. The default is set to a margin of 1 week
margin_1 = list(range(1,weeks_1+1))
# A dictionary is made with the three lists for common peaks in the same week, peak B one week after peak A or peak B one week before peak A
common_peaks_1 = {}
common_peaks_1['in same week'] = []
common_peaks_1['in margin_1 after'] = []
common_peaks_1['in margin_1 before'] = []
# Then all the peaks are for-looped to add them to the right lists
for date1 in peaks_date_dict_1[activity_1]:
    for date2 in peaks_date_dict_1[activity_2]:
        for number in margin_1:
            if date1.date() == date2.date():
                common_peaks_1['in same week'].append(str(date1.date()))
            if date1.date() + datetime.timedelta(weeks=number) == date2.date():
                common_peaks_1['in margin_1 after'].append(str(date1.date()))
            if date1.date() - datetime.timedelta(weeks=number) == date2.date():
                common_peaks_1['in margin_1 before'].append(str(date1.date()))
# And then they are printed
print('Peaks for', activity_1, 'for which', activity_2, 'has a peak in the same week in', province_1, ':', common_peaks_1['in same week'])
print('Peaks for', activity_1, 'for which', activity_2, 'has a peak within', weeks_1, 'week(s) prior in', province_1, ':', common_peaks_1['in margin_1 before'])
print('Peaks for', activity_1, 'for which', activity_2, 'has a peak within', weeks_1, 'week(s) after in', province_1, ':', common_peaks_1['in margin_1 after'])
Peaks for work trips for which grocery and pharmacy trips has a peak in the same week in Noord-Brabant : ['2020-04-02', '2020-05-14', '2020-06-18', '2020-09-24', '2021-01-07']
Peaks for work trips for which grocery and pharmacy trips has a peak within 1 week(s) prior in Noord-Brabant : ['2020-08-27', '2020-10-29', '2020-12-03', '2021-01-28']
Peaks for work trips for which grocery and pharmacy trips has a peak within 1 week(s) after in Noord-Brabant : ['2020-03-05', '2020-04-16', '2021-03-18']
# Then I do the same for the valleys
common_valleys_1 = {}
common_valleys_1['in same week'] = []
common_valleys_1['in margin_1 after'] = []
common_valleys_1['in margin_1 before'] = []
for date1 in valleys_date_dict_1[activity_1]:
    for date2 in valleys_date_dict_1[activity_2]:
        for number in margin_1:
            if date1.date() == date2.date():
                common_valleys_1['in same week'].append(str(date1.date()))
            if date1.date() + datetime.timedelta(weeks=number) == date2.date():
                common_valleys_1['in margin_1 after'].append(str(date1.date()))
            if date1.date() - datetime.timedelta(weeks=number) == date2.date():
                common_valleys_1['in margin_1 before'].append(str(date1.date()))
print('Valleys for', activity_1, 'for which', activity_2, 'has a valley in the same week in', province_1, ':', common_valleys_1['in same week'])
print('Valleys for', activity_1, 'for which', activity_2, 'has a valley within', weeks_1, 'week(s) prior in', province_1, ':', common_valleys_1['in margin_1 before'])
print('Valleys for', activity_1, 'for which', activity_2, 'has a valley within', weeks_1, 'week(s) after in', province_1, ':', common_valleys_1['in margin_1 after'])
Valleys for work trips for which grocery and pharmacy trips has a valley in the same week in Noord-Brabant : ['2020-03-26', '2020-04-09', '2020-05-21', '2020-08-06', '2020-09-03', '2020-12-24', '2021-01-14', '2021-04-01']
Valleys for work trips for which grocery and pharmacy trips has a valley within 1 week(s) prior in Noord-Brabant : ['2020-10-15', '2020-11-05', '2021-02-04']
Valleys for work trips for which grocery and pharmacy trips has a valley within 1 week(s) after in Noord-Brabant : ['2020-04-23', '2020-10-01']
# And then the same is done for the covid data
fig_2 = go.Figure()
peaks_dict_2 = {}
valleys_dict_2 = {}
peaks_date_dict_2 = {}
valleys_date_dict_2 = {}
df_full_province_2 = (df_full.loc[df_full['province'] == province_1])
df_full_province_2.reset_index(inplace=True)
for activity in activities_2:
    max_ind = my_find_peaks(df_full_province_2, activity)
    peaks_dict_2[activity] = max_ind
    min_ind = my_find_valleys(df_full_province_2, activity)
    valleys_dict_2[activity] = min_ind
    df_max_2 = df_full_province_2.iloc[max_ind]
    df_min_2 = df_full_province_2.iloc[min_ind]
    peaks_date_dict_2[activity] = df_max_2['date']
    valleys_date_dict_2[activity] = df_min_2['date']
    x1 = df_full_province_2['date']
    y1 = df_full_province_2[activity]
    x2 = df_max_2['date']
    y2 = df_max_2[activity]
    x3 = df_min_2['date']
    y3 = df_min_2[activity]
    fig_2.add_trace(go.Scatter(x=x1, y=y1, name=activity))
    fig_2.add_trace(go.Scatter(x=x2, y=y2, mode='markers', name='peaks ' + activity))
    fig_2.add_trace(go.Scatter(x=x3, y=y3, mode='markers', name='valleys ' + activity))
fig_2.update_layout(title=activity_3 + ' and ' + activity_4 + ' because of covid in ' + province_1)
fig_2.show()
# Same as for mobility data
margin_2 = list(range(1,weeks_2+1))
common_peaks_2 = {}
common_peaks_2['in same week'] = []
common_peaks_2['in margin_2 after'] = []
common_peaks_2['in margin_2 before'] = []
for date3 in peaks_date_dict_2[activity_3]:
    for date4 in peaks_date_dict_2[activity_4]:
        for number in margin_2:
            if date3.date() == date4.date():
                common_peaks_2['in same week'].append(str(date3.date()))
            if date3.date() + datetime.timedelta(weeks=number) == date4.date():
                common_peaks_2['in margin_2 after'].append(str(date3.date()))
            if date3.date() - datetime.timedelta(weeks=number) == date4.date():
                common_peaks_2['in margin_2 before'].append(str(date3.date()))
print('Peaks for', activity_3, 'for which', activity_4, 'has a peak in the same week in', province_1, ':', common_peaks_2['in same week'])
print('Peaks for', activity_3, 'for which', activity_4, 'has a peak within', weeks_2, 'week(s) prior in', province_1, ':', common_peaks_2['in margin_2 before'])
print('Peaks for', activity_3, 'for which', activity_4, 'has a peak within', weeks_2, 'week(s) after in', province_1, ':', common_peaks_2['in margin_2 after'])
Peaks for number of hospitalized patients for which number of casualties has a peak in the same week in Noord-Brabant : ['2020-03-26', '2020-05-28', '2020-12-17', '2021-01-07', '2021-04-08']
Peaks for number of hospitalized patients for which number of casualties has a peak within 1 week(s) prior in Noord-Brabant : ['2020-04-23', '2020-11-12', '2021-02-18']
Peaks for number of hospitalized patients for which number of casualties has a peak within 1 week(s) after in Noord-Brabant : ['2020-10-29']
# Same as for mobility data
common_valleys_2 = {}
common_valleys_2['in same week'] = []
common_valleys_2['in margin_2 after'] = []
common_valleys_2['in margin_2 before'] = []
for date3 in valleys_date_dict_2[activity_3]:
    for date4 in valleys_date_dict_2[activity_4]:
        for number in margin_2:
            if date3.date() == date4.date():
                common_valleys_2['in same week'].append(str(date3.date()))
            if date3.date() + datetime.timedelta(weeks=number) == date4.date():
                common_valleys_2['in margin_2 after'].append(str(date3.date()))
            if date3.date() - datetime.timedelta(weeks=number) == date4.date():
                common_valleys_2['in margin_2 before'].append(str(date3.date()))
print('Valleys for', activity_3, 'for which', activity_4, 'has a valley in the same week in', province_1, ':', common_valleys_2['in same week'])
print('Valleys for', activity_3, 'for which', activity_4, 'has a valley within', weeks_2, 'week(s) prior in', province_1, ':', common_valleys_2['in margin_2 before'])
print('Valleys for', activity_3, 'for which', activity_4, 'has a valley within', weeks_2, 'week(s) after in', province_1, ':', common_valleys_2['in margin_2 after'])
Valleys for number of hospitalized patients for which number of casualties has a valley in the same week in Noord-Brabant : ['2020-05-21', '2020-07-09', '2020-09-10', '2021-03-04', '2021-04-01']
Valleys for number of hospitalized patients for which number of casualties has a valley within 1 week(s) prior in Noord-Brabant : ['2020-04-16', '2020-11-26']
Valleys for number of hospitalized patients for which number of casualties has a valley within 1 week(s) after in Noord-Brabant : ['2020-11-26', '2020-12-24']
Now I'm going to compare the mobility and the covid data
The logic is the same as for the previous questions.
# The same as for the previous questions, but now with peaks for covid and valleys for mobility
margin_3 = list(range(1, weeks_3+1))
common_peaks_3 = {}
common_peaks_3['in same week'] = []
common_peaks_3['in margin_3 after'] = []
for date3 in peaks_date_dict_2[activity_3]:
    for date2 in valleys_date_dict_1[activity_2]:
        for number in margin_3:
            if date3.date() == date2.date():
                common_peaks_3['in same week'].append(str(date3.date()))
            if date3.date() + datetime.timedelta(weeks=number) == date2.date():
                common_peaks_3['in margin_3 after'].append(str(date3.date()))
print('Peaks in', activity_3, 'for which', activity_2, 'has a valley in the same week in', province_1, ':', common_peaks_3['in same week'])
print('Peaks in', activity_3, 'for which', activity_2, 'has a valley within', weeks_3, 'week(s) after in', province_1, ':', common_peaks_3['in margin_3 after'])
Peaks in number of hospitalized patients for which grocery and pharmacy trips has a valley in the same week in Noord-Brabant : ['2020-03-26', '2020-10-29']
Peaks in number of hospitalized patients for which grocery and pharmacy trips has a valley within 1 week(s) after in Noord-Brabant : ['2020-04-23', '2020-05-28', '2020-07-30', '2020-12-17', '2021-01-07', '2021-01-21', '2021-03-25']
As can be seen, quite a few peaks in hospitalizations are followed within a week by valleys in grocery and pharmacy trips. It could therefore be argued that the covid numbers actually affected travel behaviour in Noord-Brabant.
Although I don't think it will add much, I show plots of the two activities below, also to verify the results found above. As mentioned, they are not in the same plot because of the different units.
# Two different graphs as the units are different, but in a subplot
fig_3 = make_subplots(rows=2,cols=1)
x1 = df_full_province_2['date']
y1 = df_full_province_2[activity_3]
x2 = df_max_2['date']
y2 = df_max_2[activity_3]
x3 = df_full_province_1['date']
y3 = df_full_province_1[activity_2]
x4 = df_min_1['date']
y4 = df_min_1[activity_2]
fig_3.append_trace(go.Scatter(x=x1,y=y1,name=activity_3),row=1,col=1)
fig_3.append_trace(go.Scatter(x=x2,y=y2,mode='markers',name='peaks ' + activity_3),row=1,col=1)
fig_3.append_trace(go.Scatter(x=x3,y=y3,name=activity_2),row=2,col=1)
fig_3.append_trace(go.Scatter(x=x4,y=y4,mode='markers',name='valleys ' + activity_2),row=2,col=1)
fig_3.update_layout(title='Covid and mobility data in ' + province_1)
fig_3.show()
For this last part, I'm going to check my personal experience against the covid narrative around the province of Noord-Brabant.
First, I show my variables for this part.
# Variables from previous part
activity_1 = 'work trips'
activity_2 = 'grocery and pharmacy trips'
activity_3 = 'number of hospitalized patients'
activity_4 = 'number of casualties'
province_1 = 'Noord-Brabant'
# New variables
activity_5 = 'number of cases'
activity_6 = 'retail and recreation trips'
activities_story_1 = [activity_3, activity_4, activity_5]
activities_story_2 = [activity_1, activity_2, activity_6]
activities_story_3 = [activity_5, activity_2]
province_2 = 'Noord-Holland'
province_3 = 'Zuid-Holland'
provinces_story = [province_1, province_2, province_3]
weeks_4 = 2
Then, I'm going to make a graph of the covid data of Noord-Brabant, just to explore the data and make some initial observations.
# First, I'll make a graph of all covid data on the province of Noord-Brabant
# I have to add the number of cases as a new activity and then I'm able to make the graph
fig_5 = go.Figure()
for activity in activities_story_1:
    x1 = df_full_province_2['date']
    y1 = df_full_province_2[activity]
    fig_5.add_trace(go.Scatter(x=x1, y=y1, name=activity))
fig_5.update_layout(title='Covid data in ' + province_1)
fig_5.show()
What is interesting is that the number of cases seems very low during the first wave, especially compared to the end of 2020 and the whole of 2021. This is mainly because the Netherlands only started free PCR testing in June 2020, and by then the number of cases was already very low thanks to the lockdown and the nice weather. So when looking at the first wave, it's better to toggle off the number of cases and look at the number of hospitalized patients and the number of casualties.
I moved to The Hague in July 2020, but before that I lived in Eindhoven. During the first wave from March-May 2020, I noticed the following things:
I did notice that people in Eindhoven followed the covid rules very strictly, especially in terms of keeping their distance, both outside and inside.
When I went to the Randstad to visit family, however, I noticed that people adhered to the rules a lot less.
For this story, I want to check whether this feeling was justified, and whether Noord-Brabant was actually doing much worse than the other provinces.
So let's make a bar chart showing the number of cases per province for every month.
# Making a bar chart that can be toggled for the months. Here the monthly dataframe created in the first part comes in handy!
# First a column is added to the df_month that only shows the year and the month
df_full_month['year_month'] = df_full_month['date'].dt.strftime('%Y-%m')
# Then the bar chart can be created using this column
fig_6 = px.bar(df_full_month, x= 'number of cases', y='province', orientation='h',color='province',
animation_frame='year_month', animation_group='province')
fig_6.update_layout(showlegend=False,xaxis_range = [0,100])
fig_6.update_layout(yaxis={'categoryorder':'total ascending'})
fig_6.show()
By autoscaling the x-axis, the number of cases can also be read in the months with high numbers.
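If a fixed but data-driven range is preferred over autoscaling by hand, the upper bound can be taken from the data itself. A small sketch of that option (the 1.05 factor for some headroom is my own choice):
# Sketch: set the x-axis range from the largest monthly value instead of a hard-coded [0, 100]
x_max = df_full_month['number of cases'].max()
fig_6.update_layout(xaxis_range=[0, 1.05 * x_max])
fig_6.show()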
This shows how the narrative around Noord-Brabant was not completely unfair in the beginning
To further illustrate this, let's add a geospatial map of the cumulative number of cases per province near the end of the period covered by the dataframe (4 February 2021).
fp = 'https://geodata.nationaalgeoregister.nl/cbsgebiedsindelingen/wfs?request=GetFeature&service=WFS&version=1.1.0&typeName=cbsgebiedsindelingen:cbs_provincie_2021_gegeneraliseerd&outputFormat=json'
df_map = gpd.read_file(fp)
df_map['statnaam'] = df_map['statnaam'].replace(['Fryslân'],'Friesland')
df_map['statnaam'] = df_map['statnaam'].replace(['Limburg'],'Zuid-Limburg')
latest_date = '2021-02-04'
df_cumul = df_full.loc[df_full['date'] == latest_date]
merged = df_map.set_index('statnaam').join(df_cumul.set_index('province'))
activity_7 = 'number of cases_cum'
vmin, vmax = 10000,150000
fig, ax = plt.subplots(1,figsize=(8,6))
ax.axis('off')
ax.set_title('Cumulative number of covid cases per province on the 4th of February 2021', fontdict={'fontsize': '25', 'fontweight': '3'})
sm = plt.cm.ScalarMappable(cmap='Blues', norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])  # give the mappable an (empty) array so a colorbar can be drawn from it
cbar = fig.colorbar(sm, ax=ax)
merged.plot(column=activity_7,cmap='Blues',linewidth=0.6,ax=ax,edgecolor='0.2')
[Output: choropleth map of the cumulative number of covid cases per province on 4 February 2021]
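As an aside, geopandas can also draw the colorbar itself via the legend keyword, which avoids the manual ScalarMappable. A sketch of that variant, using the same merged GeoDataFrame:
# Sketch: let geopandas create the colorbar directly
fig, ax = plt.subplots(1, figsize=(8, 6))
ax.axis('off')
merged.plot(column=activity_7, cmap='Blues', linewidth=0.6, edgecolor='0.2',
            vmin=vmin, vmax=vmax, legend=True, ax=ax,
            legend_kwds={'label': 'cumulative number of cases'})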
Here, too, it can be seen that Noord-Brabant has not been the only province with a high number of cases.
Having observed this, let's add Noord-Holland and Zuid-Holland to the line graph a couple of blocks back.
# Making subplots for the different activities to compare between provinces
fig_8 = make_subplots(rows=len(activities_story_1), cols=1, subplot_titles=activities_story_1)
for i, activity in enumerate(activities_story_1):
    for province in provinces_story:
        df_province_story_2 = df_full.loc[df_full['province'] == province]
        df_province_story_2.reset_index(inplace=True)
        x1 = df_province_story_2['date']
        y1 = df_province_story_2[activity]
        fig_8.append_trace(go.Scatter(x=x1, y=y1, name=activity + ' ' + province), row=i+1, col=1)
fig_8.update_layout(title='Comparing covid data between ' + province_1 + ', ' + province_2 + ' and ' + province_3)
fig_8.show()
From this plot it can be seen that, for a short while, Noord-Brabant had the most reported cases and hospitalizations, but the other two provinces quickly took over, especially Zuid-Holland. The narrative around Brabant being the worst-hit province was therefore not really fair, or at most only for April 2020.
As mentioned, another thing I experienced during covid is that this narrative around Noord-Brabant made people in Eindhoven adhere to the covid rules quite strictly.
Let's find out whether that shows in the data, with a graph of the trips for three trip purposes during the covid period.
# Making subplots for the different activities to compare between provinces
fig_8 = make_subplots(rows=len(activities_story_2),cols=1,subplot_titles=(activities_story_2))
for i, activity in enumerate(activities_story_2):
    for province in provinces_story:
        df_province_story_2 = df_full.loc[df_full['province'] == province]
        df_province_story_2.reset_index(inplace=True)
        x1 = df_province_story_2['date']
        y1 = df_province_story_2[activity]
        fig_8.append_trace(go.Scatter(x=x1, y=y1, name=activity + ' ' + province), row=i+1, col=1)
fig_8.update_layout(title='Comparing relative trip numbers during covid times between ' + province_1 + ', ' + province_2 + ' and ' + province_3)
fig_8.show()
This plot shows some interesting findings (specific traces can be isolated by toggling the legend). Clearly my personal, anecdotal experience was not representative of the whole province: Noord-Brabant has the highest relative number of trips for all three trip purposes over the covid period. It would be interesting to look into differences between specific cities, as those might give different results. For now, it is interesting to see that, although the differences are not huge, the citizens of Noord-Brabant seem to have travelled the most of these three provinces during covid, despite the high number of cases, while people in the other two provinces reduced their trips more.
What would also be interesting, however, is to see whether people in the three provinces reacted in the same way to peaks in the number of cases in their own province, i.e. whether peaks in cases resulted in fewer trips in the week(s) after. For Noord-Brabant, we have already seen this at the end of part II of this assignment; let's put the results together for all three provinces.
# The same idea done in the second part, but now for multiple provinces at the same time
peaks_date_dict_3 = {}
valleys_date_dict_3 = {}
margin_4 = list(range(1, weeks_4+1))
common_peaks_4 = {}
common_peaks_4['in same week'] = []
common_peaks_4['in margin_4 after'] = []
# Note: these lists are shared between the provinces and are not reset inside the loop below,
# so each province's printed results also include the matches found for the provinces before it
# Here the for loop over the provinces is added
for province in provinces_story:
    df_province_story_3 = df_full.loc[df_full['province'] == province]
    df_province_story_3.reset_index(inplace=True)
    for activity in activities_story_3:
        max_ind = my_find_peaks(df_province_story_3, activity)
        min_ind = my_find_valleys(df_province_story_3, activity)
        df_max_3 = df_province_story_3.iloc[max_ind]
        df_min_3 = df_province_story_3.iloc[min_ind]
        peaks_date_dict_3[activity] = df_max_3['date']
        valleys_date_dict_3[activity] = df_min_3['date']
    for date5 in peaks_date_dict_3[activity_5]:
        for date2 in valleys_date_dict_3[activity_2]:
            for number in margin_4:
                if date5.date() == date2.date():
                    common_peaks_4['in same week'].append(str(date5.date()))
                if date5.date() + datetime.timedelta(weeks=number) == date2.date():
                    common_peaks_4['in margin_4 after'].append(str(date5.date()))
    unique_common_peaks_4 = {a: list(set(b)) for a, b in common_peaks_4.items()}
    print('Peaks in', activity_5, 'for which', activity_2, 'has a valley in the same week in', province, ':', unique_common_peaks_4['in same week'])
    print('Peaks in', activity_5, 'for which', activity_2, 'has a valley within', weeks_4, 'week(s) after in', province, ':', unique_common_peaks_4['in margin_4 after'])
Peaks in number of cases for which grocery and pharmacy trips has a valley in the same week in Noord-Brabant : ['2020-04-09', '2020-10-29', '2020-03-26', '2020-08-06', '2020-06-04']
Peaks in number of cases for which grocery and pharmacy trips has a valley within 2 week(s) after in Noord-Brabant : ['2020-12-17', '2020-03-26', '2021-03-25', '2020-11-19', '2020-08-20']
Peaks in number of cases for which grocery and pharmacy trips has a valley in the same week in Noord-Holland : ['2020-04-09', '2020-10-29', '2020-03-26', '2020-08-06', '2020-06-04']
Peaks in number of cases for which grocery and pharmacy trips has a valley within 2 week(s) after in Noord-Holland : ['2021-02-25', '2020-12-17', '2020-10-22', '2020-03-26', '2021-03-25', '2020-11-19', '2020-08-20', '2020-07-02']
Peaks in number of cases for which grocery and pharmacy trips has a valley in the same week in Zuid-Holland : ['2020-04-09', '2020-10-29', '2020-03-26', '2020-08-06', '2020-06-04']
Peaks in number of cases for which grocery and pharmacy trips has a valley within 2 week(s) after in Zuid-Holland : ['2021-02-25', '2020-12-17', '2020-10-22', '2020-03-26', '2020-06-25', '2020-08-06', '2021-03-25', '2020-11-19', '2020-08-20', '2020-07-02']
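Because the common_peaks_4 lists above are shared between the provinces, the Noord-Holland and Zuid-Holland results also contain the Noord-Brabant matches. If strictly per-province results are wanted, the match lists can be re-initialised inside the loop; below is only a sketch of that variant (it reuses my peak and valley functions and is not run here, so no output is shown):
# Sketch: per-province version that resets the match sets for every province
for province in provinces_story:
    df_p = df_full.loc[df_full['province'] == province].reset_index(drop=True)
    peak_dates = {a: df_p.iloc[my_find_peaks(df_p, a)]['date'] for a in activities_story_3}
    valley_dates = {a: df_p.iloc[my_find_valleys(df_p, a)]['date'] for a in activities_story_3}
    matches = {'in same week': set(), 'within margin after': set()}
    for d_case in peak_dates[activity_5]:
        for d_trip in valley_dates[activity_2]:
            # with weekly data the gap is always a whole number of weeks
            gap_weeks = (d_trip.date() - d_case.date()).days / 7
            if gap_weeks == 0:
                matches['in same week'].add(str(d_case.date()))
            elif 0 < gap_weeks <= weeks_4:
                matches['within margin after'].add(str(d_case.date()))
    print(province, ':', {k: sorted(v) for k, v in matches.items()})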
This could be done for all of the trip purposes, but at least for grocery and pharmacy trips:
All in all, it can be stated that:
It should be noted, however, that using provincial data lumps together quite different regions:
Therefore, it would be good to look into the city data before coming to hard conclusions.