import pandas as pd
import numpy as np
from scipy.signal import find_peaks
from scipy.stats import pearsonr
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# NEW ZEALAND - MOBILITY DATA
# Load the national mobility report and discard its first column by position
# (NOTE(review): presumably a saved index column - confirm against the csv)
df_mobi_nz = pd.read_csv('NZ_nation.csv').iloc[:, 1:]

# Remove the region and identifier columns that are not used in the analysis
df_mobi_nz = df_mobi_nz.drop(columns=[
    'country_region_code', 'country_region', 'sub_region_1', 'sub_region_2',
    'metro_area', 'iso_3166_2_code', 'census_fips_code', 'place_id',
])

# NEW ZEALAND - COVID-19 DATA
# Load the complete Covid-19 file (kept as df_covid) and keep only the rows
# whose 'location' is New Zealand, without the identifier columns
df_covid = pd.read_csv('owid-covid-data.csv')
df_covid_nz = df_covid[df_covid['location'] == 'New Zealand'].drop(
    columns=['iso_code', 'continent', 'location'])

# Outer join of both dataframes on 'date', so a date that occurs in only one
# of the two sources is kept as well
df_nz = pd.merge(df_mobi_nz, df_covid_nz,
                 on='date', how='outer')


# Column names of the six mobility activities, in the order in which the rest
# of the script refers to them by position (act_list[0] ... act_list[5])
act_list = [
    'retail_and_recreation_percent_change_from_baseline',
    'grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'workplaces_percent_change_from_baseline',
    'residential_percent_change_from_baseline',
]


# INITIALISATION

# find_peaks returns a tuple whose first element holds the indices of the
# detected peaks; only those indices are kept. With height=10 and distance=5
# a peak has to reach at least 10 and lie at least 5 samples away from the
# neighbouring peaks.

# Indices of the peaks of the first and of the second activity
peaks_act = find_peaks(df_nz[act_list[0]], height=10, distance=5)[0]
peaks_act_two = find_peaks(df_nz[act_list[1]], height=10, distance=5)[0]

# Valleys are located as the peaks of the negated series, so height=10 keeps
# only the minima at or below -10
valleys_act = find_peaks(-df_nz[act_list[0]], height=10, distance=5)[0]
valleys_act_two = find_peaks(-df_nz[act_list[1]], height=10, distance=5)[0]


# Row positions of the matched peaks of both activities and the matching dates
match_act, match_act_two, date_list_act = [], [], []

# Date column of the merged dataframe 'df_nz'
df_date = df_nz['date']

# Number of rows by which the peak of the second activity may be shifted
offset = 2

# A peak of the first activity at row i is a common peak when the second
# activity has a peak at row i - offset, i or i + offset. The stored date is
# the date of the peak of the second activity.
# NOTE(review): only these three exact rows match; a shift of one row does
# not - confirm that this is the intended meaning of the offset.
for i in peaks_act:
    for j in (i - offset, i, i + offset):
        if j in peaks_act_two:
            match_act.append(i)
            match_act_two.append(j)
            date_list_act.append(df_date.iloc[j])

# Values of both activities at the matched rows, as arrays
df_max_act = np.array(df_nz[act_list[0]].iloc[match_act])
df_max_act_two = np.array(df_nz[act_list[1]].iloc[match_act_two])
date_list_act = np.array(date_list_act)

# One row per common peak: the date and the value of each activity
df_max_act_all = pd.DataFrame({
    'date': date_list_act,
    act_list[0]: df_max_act,
    act_list[1]: df_max_act_two,
})

# Display the common peaks of both activities
df_max_act_all


# PLOT THE DATAFRAME

# Scatter plot of the common peaks, one trace per activity
fig = px.scatter(
    df_max_act_all,
    x='date',
    y=act_list[:2],
    title='Common peaks between two activities',
    labels={'date': 'Date', 'value': 'Percentage change', 'variable': 'Type of mobility data'},
)

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend={'yanchor': 'top', 'y': 1.25, 'xanchor': 'right', 'x': 0.99})

# Show the graph
fig.show()


# Row positions of the matched valleys of both activities and the matching dates
match_act, match_act_two, date_list_act = [], [], []

# Date column of the merged dataframe 'df_nz'
df_date = df_nz['date']

# Number of rows by which the valley of the second activity may be shifted
offset = 2

# A valley of the first activity at row i is a common valley when the second
# activity has a valley at row i - offset, i or i + offset. The stored date is
# the date of the valley of the second activity.
# NOTE(review): only these three exact rows match; a shift of one row does
# not - confirm that this is the intended meaning of the offset.
for i in valleys_act:
    for j in (i - offset, i, i + offset):
        if j in valleys_act_two:
            match_act.append(i)
            match_act_two.append(j)
            date_list_act.append(df_date.iloc[j])

# Values of both activities at the matched rows, as arrays
df_min_act = np.array(df_nz[act_list[0]].iloc[match_act])
df_min_act_two = np.array(df_nz[act_list[1]].iloc[match_act_two])
date_list_act = np.array(date_list_act)

# One row per common valley: the date and the value of each activity
df_min_act_all = pd.DataFrame({
    'date': date_list_act,
    act_list[0]: df_min_act,
    act_list[1]: df_min_act_two,
})

# Display the common valleys of both activities
df_min_act_all


# PLOT THE DATAFRAME

# Scatter plot of the common valleys, one trace per activity
fig = px.scatter(
    df_min_act_all,
    x='date',
    y=act_list[:2],
    title='Common valleys between two activities',
    labels={'date': 'Date', 'value': 'Percentage change', 'variable': 'Type of mobility data'},
)

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend={'yanchor': 'top', 'y': 1.25, 'xanchor': 'right', 'x': 0.99})

# Show the graph
fig.show()


# INITIALISATION

# Columns of the first and of the second type of Covid-19 data
covid_data = 'new_tests'
covid_data_two = 'new_cases'

# Indices of the peaks (first element of the find_peaks result): at least 1000
# for the first type and at least 10 for the second type, 5 samples apart
peaks_cov = find_peaks(df_nz[covid_data], height=1000, distance=5)[0]
peaks_cov_two = find_peaks(df_nz[covid_data_two], height=10, distance=5)[0]

# Indices of the valleys, located as the peaks of the negated series
# NOTE(review): a positive height on the negated series only accepts values
# at or below -height. If these columns hold non-negative counts, no valley
# can ever qualify and both arrays stay empty - confirm the intended thresholds.
valleys_cov = find_peaks(-df_nz[covid_data], height=10, distance=5)[0]
valleys_cov_two = find_peaks(-df_nz[covid_data_two], height=1, distance=5)[0]


# Row positions of the matched peaks of both types of Covid-19 data and the
# matching dates
match_cov, match_cov_two, date_list_cov = [], [], []

# Date column of the merged dataframe 'df_nz'
df_date = df_nz['date']

# Number of rows by which the peak of the second type may be shifted
offset = 2

# A peak of the first type at row i is a common peak when the second type has
# a peak at row i - offset, i or i + offset. The stored date is the date of
# the peak of the second type.
# NOTE(review): only these three exact rows match; a shift of one row does
# not - confirm that this is the intended meaning of the offset.
for i in peaks_cov:
    for j in (i - offset, i, i + offset):
        if j in peaks_cov_two:
            match_cov.append(i)
            match_cov_two.append(j)
            date_list_cov.append(df_date.iloc[j])

# Values of both types of Covid-19 data at the matched rows, as arrays
df_max_cov = np.array(df_nz[covid_data].iloc[match_cov])
df_max_cov_two = np.array(df_nz[covid_data_two].iloc[match_cov_two])
date_list_cov = np.array(date_list_cov)

# One row per common peak: the date and the value of each type
df_max_cov_all = pd.DataFrame({
    'date': date_list_cov,
    covid_data: df_max_cov,
    covid_data_two: df_max_cov_two,
})

# Display the common peaks of both types of Covid-19 data
df_max_cov_all


# PLOT THE DATAFRAME

# Scatter plot of the common peaks, one trace per type of Covid-19 data
fig = px.scatter(
    df_max_cov_all,
    x='date',
    y=[covid_data, covid_data_two],
    title='Common peaks between two types of Covid-19 data',
    labels={'date': 'Date', 'value': 'Quantity', 'variable': 'Type of Covid-19 data'},
)

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend={'yanchor': 'top', 'y': 1.25, 'xanchor': 'right', 'x': 0.99})

# Show the graph
fig.show()


# Row positions of the matched valleys of both types of Covid-19 data and the
# matching dates
match_cov, match_cov_two, date_list_cov = [], [], []

# Date column of the merged dataframe 'df_nz'
df_date = df_nz['date']

# Number of rows by which the valley of the second type may be shifted
offset = 2

# A valley of the first type at row i is a common valley when the second type
# has a valley at row i - offset, i or i + offset. The stored date is the date
# of the valley of the second type.
# NOTE(review): only these three exact rows match; a shift of one row does
# not - confirm that this is the intended meaning of the offset.
for i in valleys_cov:
    for j in (i - offset, i, i + offset):
        if j in valleys_cov_two:
            match_cov.append(i)
            match_cov_two.append(j)
            date_list_cov.append(df_date.iloc[j])

# Values of both types of Covid-19 data at the matched rows, as arrays
df_min_cov = np.array(df_nz[covid_data].iloc[match_cov])
df_min_cov_two = np.array(df_nz[covid_data_two].iloc[match_cov_two])
date_list_cov = np.array(date_list_cov)

# One row per common valley: the date and the value of each type
df_min_cov_all = pd.DataFrame({
    'date': date_list_cov,
    covid_data: df_min_cov,
    covid_data_two: df_min_cov_two,
})

# Display the common valleys of both types of Covid-19 data
df_min_cov_all


# PLOT THE DATAFRAME

# Scatter plot of the common valleys, one trace per type of Covid-19 data
fig = px.scatter(
    df_min_cov_all,
    x='date',
    y=[covid_data, covid_data_two],
    title='Common valleys between two types of Covid-19 data',
    labels={'date': 'Date', 'value': 'Quantity', 'variable': 'Type of Covid-19 data'},
)

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend={'yanchor': 'top', 'y': 1.25, 'xanchor': 'right', 'x': 0.99})

# Show the graph
fig.show()


# INITIALISATION

# Column of the type of Covid-19 data; the activity is the last entry of
# act_list (act_list[5])
covid_data = 'new_cases'

# Indices of the peaks (first element of the find_peaks result): at least 20
# for the activity and at least 10 for the Covid-19 data, 5 samples apart
peaks_actcov = find_peaks(df_nz[act_list[5]], height=20, distance=5)[0]
peaks_actcov_two = find_peaks(df_nz[covid_data], height=10, distance=5)[0]

# Indices of the valleys, located as the peaks of the negated series, so
# height=1 keeps only the minima at or below -1
# NOTE(review): if the Covid-19 column holds non-negative counts, no valley
# can qualify and valleys_actcov_two stays empty - confirm the threshold.
valleys_actcov = find_peaks(-df_nz[act_list[5]], height=1, distance=5)[0]
valleys_actcov_two = find_peaks(-df_nz[covid_data], height=1, distance=5)[0]


# Rows of the activity and of the Covid-19 data at their detected peaks
df_max_actcov = df_nz[[act_list[5], 'date']].iloc[peaks_actcov]
df_max_actcov_two = df_nz[[covid_data, 'date']].iloc[peaks_actcov_two]

# Parse the dates so that they can be grouped by week
df_max_actcov['date'] = pd.to_datetime(df_max_actcov['date'])
df_max_actcov_two['date'] = pd.to_datetime(df_max_actcov_two['date'])

# Highest activity peak per weekly bin ('W-MON'); bins without any peak give
# NaN and are removed by requiring two non-NaN values per row
df_group_actcov = (
    df_max_actcov
    .groupby(pd.Grouper(freq='W-MON', key='date'))[act_list[5]]
    .max()
    .reset_index()
    .dropna(thresh=2)
)

# Highest Covid-19 peak per weekly bin, treated in the same way
df_group_actcov_two = (
    df_max_actcov_two
    .groupby(pd.Grouper(freq='W-MON', key='date'))[covid_data]
    .max()
    .reset_index()
    .dropna(thresh=2)
)

# Weeks that contain a peak of both series are the common peaks (inner join)
df_merge_max_actcov = pd.merge(df_group_actcov, df_group_actcov_two, on='date')

# Display the common peaks of the activity and the type of Covid-19 data
df_merge_max_actcov


# PLOT THE DATAFRAME

# Scatter plot of the common peaks, one trace per series
fig = px.scatter(
    df_merge_max_actcov,
    x='date',
    y=[act_list[5], covid_data],
    title='Common peaks between an activity and a type of Covid-19 data',
    labels={'date': 'Date', 'value': 'Percentage change / number of cases',
            'variable': 'Type of mobility data / type of Covid-19 data'},
)

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend={'yanchor': 'top', 'y': 1.25, 'xanchor': 'right', 'x': 1.05})

# Show the graph
fig.show()


# Covariance matrix of the activity and the Covid-19 values at the common peaks
covariance = np.cov(df_merge_max_actcov[act_list[5]],
                    df_merge_max_actcov[covid_data])
print('The covariance matrix is:', end='\n\n')
print(covariance, end='\n\n')

# Pearson's correlation coefficient of the same two columns; the second
# return value of pearsonr is not used
pearson, _ = pearsonr(df_merge_max_actcov[act_list[5]],
                      df_merge_max_actcov[covid_data])
print('The Pearson\'s correlation coefficient is: %.3f' % pearson)

The covariance matrix is:

[[  3.46666667 -10.2       ]
 [-10.2        935.2       ]]

The Pearson's correlation coefficient is: -0.179


# Rows of the activity and of the Covid-19 data at their detected valleys
df_min_actcov = df_nz[[act_list[5], 'date']].iloc[valleys_actcov]
df_min_actcov_two = df_nz[[covid_data, 'date']].iloc[valleys_actcov_two]

# Parse the dates so that they can be grouped by week
df_min_actcov['date'] = pd.to_datetime(df_min_actcov['date'])
df_min_actcov_two['date'] = pd.to_datetime(df_min_actcov_two['date'])

# Lowest activity valley per weekly bin ('W-MON'); bins without any valley
# give NaN and are removed by requiring two non-NaN values per row
df_group_actcov = (
    df_min_actcov
    .groupby(pd.Grouper(freq='W-MON', key='date'))[act_list[5]]
    .min()
    .reset_index()
    .dropna(thresh=2)
)

# Lowest Covid-19 valley per weekly bin, treated in the same way
df_group_actcov_two = (
    df_min_actcov_two
    .groupby(pd.Grouper(freq='W-MON', key='date'))[covid_data]
    .min()
    .reset_index()
    .dropna(thresh=2)
)

# Weeks that contain a valley of both series are the common valleys (inner join)
df_merge_min_actcov = pd.merge(df_group_actcov, df_group_actcov_two, on='date')

# Display the common valleys of the activity and the type of Covid-19 data
df_merge_min_actcov


# PLOT THE DATAFRAME

# Scatter plot of the common valleys, one trace per series
fig = px.scatter(
    df_merge_min_actcov,
    x='date',
    y=[act_list[5], covid_data],
    title='Common valleys between an activity and a type of Covid-19 data',
    labels={'date': 'Date', 'value': 'Percentage change / number of cases',
            'variable': 'Type of mobility data / type of Covid-19 data'},
)

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend={'yanchor': 'top', 'y': 1.25, 'xanchor': 'right', 'x': 1.05})

# Show the graph
fig.show()


# AUSTRALIA - MOBILITY DATA
# Load the 2020 and 2021 Google mobility reports of Australia
df_2020 = pd.read_csv('2020_AU_Region_Mobility_Report.csv')
df_2021 = pd.read_csv('2021_AU_Region_Mobility_Report.csv')

# Stack both years into one dataframe with a fresh 0..n-1 index
df_aus = pd.concat([df_2020, df_2021], ignore_index=True)

# Country-level rows are the ones without a value in either sub-region column
df_mobi_aus = df_aus[df_aus['sub_region_1'].isna() & df_aus['sub_region_2'].isna()]

# Remove the region and identifier columns that are not used in the analysis
df_mobi_aus = df_mobi_aus.drop(columns=[
    'country_region_code', 'country_region', 'sub_region_1', 'sub_region_2',
    'metro_area', 'iso_3166_2_code', 'census_fips_code', 'place_id',
])

# AUSTRALIA - COVID-19 DATA
# Rows of the already loaded Covid-19 file (df_covid) whose 'location' is
# Australia, without the identifier columns
df_covid_aus = df_covid[df_covid['location'] == 'Australia'].drop(
    columns=['iso_code', 'continent', 'location'])


# Give both mobility dataframes the same number of rows by cutting the longer
# one down to the length of the shorter one (rows are dropped from the end).
# NOTE(review): this is purely positional; it does not check that the
# remaining rows of both countries cover the same dates.
if len(df_mobi_aus) > len(df_mobi_nz):
    df_mobi_aus = df_mobi_aus.iloc[:len(df_mobi_nz)]
elif len(df_mobi_aus) < len(df_mobi_nz):
    df_mobi_nz = df_mobi_nz.iloc[:len(df_mobi_aus)]


# Reshape the mobility data of New Zealand to long format (date, name, value),
# which plotly.express can split into one line per activity
nz_names = [column for column in df_mobi_nz.columns if column != 'date']
df_nz_melt = df_mobi_nz.melt(id_vars=['date'], value_vars=nz_names, var_name='name')

# Line plot with one coloured and marked line per type of mobility data
fig = px.line(
    df_nz_melt,
    x='date',
    y='value',
    color='name',
    symbol='name',
    title='Mobility data of New Zealand during 2020/2021',
    labels={'date': 'Date', 'value': 'Percentage change', 'name': 'Type of mobility data'},
)

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend={'yanchor': 'top', 'y': 1.5, 'xanchor': 'right', 'x': 0.99})

# Show the graph
fig.show()


# Leave out the first column for the computation
# (NOTE(review): presumably the 'date' column - the selection is positional)
df_mobi_nz_pear = df_mobi_nz.iloc[:, 1:]

# Pairwise Pearson's correlation coefficients of the remaining columns
df_mobi_nz_hm = df_mobi_nz_pear.corr(method='pearson')

# Annotated heatmap of the correlation matrix
sns.heatmap(data=df_mobi_nz_hm, annot=True)
plt.title('Pearson\'s correlation coefficient - New Zealand')

Text(0.5, 1.0, "Pearson's correlation coefficient - New Zealand")


# Reshape the mobility data of Australia to long format (date, name, value),
# which plotly.express can split into one line per activity
aus_names = [column for column in df_mobi_aus.columns if column != 'date']
df_aus_melt = df_mobi_aus.melt(id_vars=['date'], value_vars=aus_names, var_name='name')

# Line plot with one coloured and marked line per type of mobility data
fig = px.line(
    df_aus_melt,
    x='date',
    y='value',
    color='name',
    symbol='name',
    title='Mobility data of Australia during 2020/2021',
    labels={'date': 'Date', 'value': 'Percentage change', 'name': 'Type of mobility data'},
)

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend={'yanchor': 'top', 'y': 1.5, 'xanchor': 'right', 'x': 0.99})

# Show the graph
fig.show()


# Leave out the first column for the computation
# (NOTE(review): presumably the 'date' column - the selection is positional)
df_mobi_aus_srcc = df_mobi_aus.iloc[:, 1:]

# Pairwise Pearson's correlation coefficients of the remaining columns
df_mobi_aus_hm = df_mobi_aus_srcc.corr(method='pearson')

# Annotated heatmap of the correlation matrix
sns.heatmap(data=df_mobi_aus_hm, annot=True)
plt.title('Pearson\'s correlation coefficient - Australia')

Text(0.5, 1.0, "Pearson's correlation coefficient - Australia")


# RETAIL AND RECREATION
# Draw the 'Retail and recreation' percentage change of New Zealand and
# Australia as two lines on the same axes
ax = df_mobi_nz.plot(x='date', y=act_list[0], label='New Zealand', figsize=(15, 8))
df_mobi_aus.plot(x='date', y=act_list[0], label='Australia', ax=ax)

# Set the title and the labels of the x and y-axis
plt.title('Retail and recreation', fontsize=18)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Percentage change from baseline', fontsize=15)

# Set the font size of the ticks of the x and y-axis
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Left-align the labels of the x-ticks
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

# Show horizontal grid lines at the major y-ticks. The on/off flag is passed
# positionally because its keyword `b` was renamed to `visible` in
# Matplotlib 3.5 and is rejected by later releases.
plt.grid(True, which='major', axis='y')

# Legend in the upper left corner (loc=2) with a font size of 12
plt.legend(loc=2, prop={'size': 12})

<matplotlib.legend.Legend at 0x2c6029db50>


# GROCERY AND PHARMACY
# Draw the 'Grocery and pharmacy' percentage change of New Zealand and
# Australia as two lines on the same axes
ax = df_mobi_nz.plot(x='date', y=act_list[1], label='New Zealand', figsize=(15, 8))
df_mobi_aus.plot(x='date', y=act_list[1], label='Australia', ax=ax)

# Set the title and the labels of the x and y-axis
plt.title('Grocery and pharmacy', fontsize=18)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Percentage change from baseline', fontsize=15)

# Set the font size of the ticks of the x and y-axis
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Left-align the labels of the x-ticks
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

# Show horizontal grid lines at the major y-ticks. The on/off flag is passed
# positionally because its keyword `b` was renamed to `visible` in
# Matplotlib 3.5 and is rejected by later releases.
plt.grid(True, which='major', axis='y')

# Legend at the default location with a font size of 12
plt.legend(prop={'size': 12})

<matplotlib.legend.Legend at 0x2c60285b80>


# PARKS
# Draw the 'Parks' percentage change of New Zealand and Australia as two
# lines on the same axes
ax = df_mobi_nz.plot(x='date', y=act_list[2], label='New Zealand', figsize=(15, 8))
df_mobi_aus.plot(x='date', y=act_list[2], label='Australia', ax=ax)

# Set the title and the labels of the x and y-axis
plt.title('Parks', fontsize=18)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Percentage change from baseline', fontsize=15)

# Set the font size of the ticks of the x and y-axis
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Left-align the labels of the x-ticks
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

# Show horizontal grid lines at the major y-ticks. The on/off flag is passed
# positionally because its keyword `b` was renamed to `visible` in
# Matplotlib 3.5 and is rejected by later releases.
plt.grid(True, which='major', axis='y')

# Legend at the default location with a font size of 12
plt.legend(prop={'size': 12})

<matplotlib.legend.Legend at 0x2c5fa1a940>


# PUBLIC TRANSPORT
# Draw the 'Public transport' (transit stations) percentage change of
# New Zealand and Australia as two lines on the same axes
ax = df_mobi_nz.plot(x='date', y=act_list[3], label='New Zealand', figsize=(15, 8))
df_mobi_aus.plot(x='date', y=act_list[3], label='Australia', ax=ax)

# Set the title and the labels of the x and y-axis
plt.title('Public transport', fontsize=18)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Percentage change from baseline', fontsize=15)

# Set the font size of the ticks of the x and y-axis
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Left-align the labels of the x-ticks
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

# Show horizontal grid lines at the major y-ticks. The on/off flag is passed
# positionally because its keyword `b` was renamed to `visible` in
# Matplotlib 3.5 and is rejected by later releases.
plt.grid(True, which='major', axis='y')

# Legend at the default location with a font size of 12
plt.legend(prop={'size': 12})

<matplotlib.legend.Legend at 0x2c601ddf70>


# WORKPLACES
# Draw the 'Workplaces' percentage change of New Zealand and Australia as two
# lines on the same axes
ax = df_mobi_nz.plot(x='date', y=act_list[4], label='New Zealand', figsize=(15, 8))
df_mobi_aus.plot(x='date', y=act_list[4], label='Australia', ax=ax)

# Set the title and the labels of the x and y-axis
plt.title('Workplaces', fontsize=18)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Percentage change from baseline', fontsize=15)

# Set the font size of the ticks of the x and y-axis
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Left-align the labels of the x-ticks
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

# Show horizontal grid lines at the major y-ticks. The on/off flag is passed
# positionally because its keyword `b` was renamed to `visible` in
# Matplotlib 3.5 and is rejected by later releases.
plt.grid(True, which='major', axis='y')

# Legend in the lower right corner (loc=4) with a font size of 12
plt.legend(loc=4, prop={'size': 12})

<matplotlib.legend.Legend at 0x2c63e2a280>


# RESIDENTIAL
# Draw the 'Residential' percentage change of New Zealand and Australia as
# two lines on the same axes
ax = df_mobi_nz.plot(x='date', y=act_list[5], label='New Zealand', figsize=(15, 8))
df_mobi_aus.plot(x='date', y=act_list[5], label='Australia', ax=ax)

# Set the title and the labels of the x and y-axis
plt.title('Residential', fontsize=18)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Percentage change from baseline', fontsize=15)

# Set the font size of the ticks of the x and y-axis
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Left-align the labels of the x-ticks
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

# Show horizontal grid lines at the major y-ticks. The on/off flag is passed
# positionally because its keyword `b` was renamed to `visible` in
# Matplotlib 3.5 and is rejected by later releases.
plt.grid(True, which='major', axis='y')

# Legend at the default location with a font size of 12
plt.legend(prop={'size': 12})

<matplotlib.legend.Legend at 0x2c63e2a5b0>


# Pearson's correlation coefficient between New Zealand and Australia for
# every activity; only the coefficient (first element of the result) is kept.
# NOTE(review): the two columns are paired row by row, so this relies on both
# dataframes having the same length and the same date order.
pear = [pearsonr(df_mobi_nz[activity], df_mobi_aus[activity])[0]
        for activity in act_list]

# Combine the activities and their Pearson's coefficients into a dataframe
df_stat = pd.DataFrame({
    'Activity': act_list,
    'Pearson\'s correlation coefficient': pear,
})

# Display the dataframe
df_stat


# Give both Covid-19 dataframes the same number of rows by cutting the longer
# one down to the length of the shorter one (rows are dropped from the end).
# NOTE(review): this is purely positional; it does not check that the
# remaining rows of both countries cover the same dates.
if len(df_covid_aus) > len(df_covid_nz):
    df_covid_aus = df_covid_aus.iloc[:len(df_covid_nz)]
elif len(df_covid_aus) < len(df_covid_nz):
    df_covid_nz = df_covid_nz.iloc[:len(df_covid_aus)]


# Columns of the Covid-19 data that are compared between the two countries,
# referred to by position (cov_list[0] ... cov_list[2]) further on
cov_list = [
    'total_cases_per_million',
    'total_deaths_per_million',
    'stringency_index',
]


# NEW ZEALAND
# Take the date and the first parameter of cov_list and name the parameter
# column after the country. rename() returns a new dataframe here instead of
# modifying the column selection in place, which avoids the
# SettingWithCopyWarning of pandas.
df_cov_nz_one = df_covid_nz[['date', cov_list[0]]].rename(columns={cov_list[0]: 'New Zealand'})

# Transform the dataframe to long format (date, name, value) for plotly.express
nz_names_one = [nz for nz in df_cov_nz_one.columns if nz != 'date']
nz_melt_one = df_cov_nz_one.melt(id_vars=['date'], var_name='name', value_vars=nz_names_one)

# AUSTRALIA
# Same selection and renaming for Australia
df_cov_aus_one = df_covid_aus[['date', cov_list[0]]].rename(columns={cov_list[0]: 'Australia'})

# Transform the dataframe to long format (date, name, value) for plotly.express
aus_names_one = [aus for aus in df_cov_aus_one.columns if aus != 'date']
aus_melt_one = df_cov_aus_one.melt(id_vars=['date'], var_name='name', value_vars=aus_names_one)

# MERGE THE TWO DATAFRAMES INTO ONE
# Stack the rows of both countries underneath each other
df_nz_aus_one = pd.concat([nz_melt_one, aus_melt_one])

# PLOT THE DATAFRAME
# Line plot with one coloured and marked line per country
fig = px.line(df_nz_aus_one, 'date', 'value', color='name', symbol='name',
              title='Total cases per million inhabitants',
              labels={'date': 'Date', 'value': 'Number of cases per million inhabitants', 'name': 'Country'})

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend=dict(yanchor='top', y=1.25, xanchor='right', x=0.99))

# Show the graph
fig.show()

C:\Users\sila\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\frame.py:5039: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# NEW ZEALAND
# Take the date and the second parameter of cov_list and name the parameter
# column after the country. rename() returns a new dataframe here instead of
# modifying the column selection in place, which avoids the
# SettingWithCopyWarning of pandas.
df_cov_nz_two = df_covid_nz[['date', cov_list[1]]].rename(columns={cov_list[1]: 'New Zealand'})

# Transform the dataframe to long format (date, name, value) for plotly.express
nz_names_two = [nz for nz in df_cov_nz_two.columns if nz != 'date']
nz_melt_two = df_cov_nz_two.melt(id_vars=['date'], var_name='name', value_vars=nz_names_two)

# AUSTRALIA
# Same selection and renaming for Australia
df_cov_aus_two = df_covid_aus[['date', cov_list[1]]].rename(columns={cov_list[1]: 'Australia'})

# Transform the dataframe to long format (date, name, value) for plotly.express
aus_names_two = [aus for aus in df_cov_aus_two.columns if aus != 'date']
aus_melt_two = df_cov_aus_two.melt(id_vars=['date'], var_name='name', value_vars=aus_names_two)

# MERGE THE TWO DATAFRAMES INTO ONE
# Stack the rows of both countries underneath each other
df_nz_aus_two = pd.concat([nz_melt_two, aus_melt_two])

# PLOT THE DATAFRAME
# Line plot with one coloured and marked line per country
fig = px.line(df_nz_aus_two, 'date', 'value', color='name', symbol='name',
              title='Total deaths per millions inhabitants',
              labels={'date': 'Date', 'value': 'Number of deaths per million inhabitants', 'name': 'Country'})

# Place the legend above the plot area, aligned to the right
fig.update_layout(legend=dict(yanchor='top', y=1.25, xanchor='right', x=0.99))

# Show the graph
fig.show()


# NEW ZEALAND
# Take the date and the third parameter of cov_list and name the parameter
# column after the country. rename() returns a new dataframe here instead of
# modifying the column selection in place, which avoids the
# SettingWithCopyWarning of pandas.
df_cov_nz_three = df_covid_nz[['date', cov_list[2]]].rename(columns={cov_list[2]: 'New Zealand'})

# Transform the dataframe to long format (date, name, value) for plotly.express
nz_names_three = [nz for nz in df_cov_nz_three.columns if nz != 'date']
nz_melt_three = df_cov_nz_three.melt(id_vars=['date'], var_name='name', value_vars=nz_names_three)

# AUSTRALIA
# Same selection and renaming for Australia
df_cov_aus_three = df_covid_aus[['date', cov_list[2]]].rename(columns={cov_list[2]: 'Australia'})

# Transform the dataframe to long format (date, name, value) for plotly.express
aus_names_three = [aus for aus in df_cov_aus_three.columns if aus != 'date']
aus_melt_three = df_cov_aus_three.melt(id_vars=['date'], var_name='name', value_vars=aus_names_three)

# MERGE THE TWO DATAFRAMES INTO ONE
# Stack the rows of both countries underneath each other
df_nz_aus_three = pd.concat([nz_melt_three, aus_melt_three])

# PLOT THE DATAFRAME
# Line plot with one coloured and marked line per country
fig = px.line(df_nz_aus_three, 'date', 'value', color='name', symbol='name',
              title='Stringency index',
              labels={'date': 'Date', 'value': '-', 'name': 'Country'})


# Place the legend above the plot area, aligned to the right
fig.update_layout(legend=dict(yanchor='top', y=1.25, xanchor='right', x=0.99))

# Show the graph
fig.show()


# NEW ZEALAND
# Parse the dates, then take per weekly bin ('W-MON') the minimum of each of
# the three parameters of cov_list, one dataframe (date, parameter) each
df_covid_nz['date'] = pd.to_datetime(df_covid_nz['date'])
df_cov_nz_zero, df_cov_nz_one, df_cov_nz_two = (
    df_covid_nz.groupby(pd.Grouper(freq='W-MON', key='date'))[column].min().reset_index()
    for column in cov_list
)

# Join the three weekly dataframes on the date and add the country name
df_cov_nz_gro = pd.merge(
    pd.merge(df_cov_nz_zero, df_cov_nz_one, on='date'),
    df_cov_nz_two,
    on='date',
).assign(country='New Zealand')

# Weeks without a value for the second parameter are set to zero
df_cov_nz_gro[cov_list[1]] = df_cov_nz_gro[cov_list[1]].fillna(0)

# Every missing value of the third parameter is replaced by 81.02. Per the
# original author's note only one value was missing, only in the data of
# New Zealand, and 81.02 was the value of the neighbouring days.
df_cov_nz_gro[cov_list[2]] = df_cov_nz_gro[cov_list[2]].fillna(81.02)

# AUSTRALIA
# Same weekly minimum per parameter for Australia
df_covid_aus['date'] = pd.to_datetime(df_covid_aus['date'])
df_cov_aus_zero, df_cov_aus_one, df_cov_aus_two = (
    df_covid_aus.groupby(pd.Grouper(freq='W-MON', key='date'))[column].min().reset_index()
    for column in cov_list
)

# Join the three weekly dataframes on the date and add the country name
df_cov_aus_gro = pd.merge(
    pd.merge(df_cov_aus_zero, df_cov_aus_one, on='date'),
    df_cov_aus_two,
    on='date',
).assign(country='Australia')

# Weeks without a value for the second parameter are set to zero
df_cov_aus_gro[cov_list[1]] = df_cov_aus_gro[cov_list[1]].fillna(0)

# MERGE THE TWO DATAFRAMES INTO ONE
# Stack the rows of both countries and turn the dates into strings
df_nz_aus = pd.concat([df_cov_nz_gro, df_cov_aus_gro])
df_nz_aus['date'] = df_nz_aus['date'].astype(str)


# Animated scatter plot with one frame per date and one marker per country:
# total cases per million on the x-axis, total deaths per million on the
# y-axis and the stringency index as the size of the marker
fig = px.scatter(
    df_nz_aus,
    x=cov_list[0],
    y=cov_list[1],
    size=cov_list[2],
    size_max=50,
    color='country',
    hover_name='country',
    animation_frame='date',
    animation_group='country',
    range_x=[-200, 3600],
    range_y=[-10, 50],
    title='Stringency index of New Zealand and Australia',
    labels={
        cov_list[0]: 'Total cases per million inhabitants',
        cov_list[1]: 'Total deaths per million inhabitants',
        'country': 'Country',
    },
)

# Show the graph
fig.show()

	date	retail_and_recreation_percent_change_from_baseline	grocery_and_pharmacy_percent_change_from_baseline
0	2021-04-01	22.0	40.0
1	2021-04-26	19.0	12.0
2	2021-05-08	22.0	15.0
3	2021-05-15	21.0	11.0
4	2021-05-20	21.0	11.0
5	2021-06-05	23.0	15.0
6	2021-06-10	20.0	11.0
7	2021-06-17	18.0	10.0
8	2021-07-03	22.0	12.0
9	2021-07-10	22.0	13.0
10	2021-07-15	16.0	15.0
11	2021-07-22	20.0	15.0
12	2021-07-31	20.0	13.0
13	2021-08-07	21.0	13.0
14	2021-08-12	22.0	14.0

	date	retail_and_recreation_percent_change_from_baseline	grocery_and_pharmacy_percent_change_from_baseline
0	2020-03-26	-92.0	-58.0
1	2020-04-10	-95.0	-91.0
2	2020-04-25	-89.0	-53.0
3	2020-05-03	-87.0	-38.0
4	2020-05-25	-29.0	-16.0
5	2020-05-31	-29.0	-13.0
6	2020-06-08	-21.0	-14.0
7	2020-06-24	-16.0	-10.0
8	2020-06-29	-17.0	-13.0
9	2020-08-03	-10.0	-10.0
10	2020-08-30	-25.0	-11.0
11	2020-09-07	-13.0	-10.0
12	2020-12-25	-83.0	-89.0
13	2021-01-01	-32.0	-26.0
14	2021-02-15	-27.0	-13.0
15	2021-04-02	-45.0	-68.0

	date	new_tests	new_cases
0	2020-03-22	2192.0	50.0
1	2020-09-30	6359.0	12.0
2	2021-07-12	15348.0	18.0
3	2021-09-21	19194.0	24.0

	date	residential_percent_change_from_baseline	new_cases
0	2020-03-30	38.0	85.0
1	2020-04-06	37.0	89.0
2	2020-04-13	42.0	44.0
3	2020-04-20	37.0	20.0
4	2021-08-23	38.0	32.0
5	2021-08-30	38.0	84.0

General requirements for the assignment¶

DEADLINE FOR THIS ASSIGNMENT IS 29 OCTOBER 2021 BEFORE 23:59¶

Assignment¶

Import libraries ¶

Part I - Data import¶

Data origin and location ¶

1. Create the new dataframe ¶

Explanation of the new dataframe 'df_nz' ¶

Part II - Data processing¶

2. Explanation and pseudo-code of the offset algorithm ¶

Algorithm method to find the common peaks and valleys ¶

3. Common peaks and valleys between two activities in New Zealand ¶

Find peaks and valleys of the two activities ¶

Find the common peaks ¶

Find the common valleys ¶

4. Common peaks and valleys between two types of Covid-19 data in New Zealand ¶

Find peaks and valleys of two types of Covid-19 data ¶

Find the common peaks ¶

Find the common valleys ¶

5. Compare peaks of one activity and one type of Covid-data in New Zealand ¶

Built-in method to find the common peaks / valleys ¶

Find peaks and valleys of the activity and the type of Covid-19 data ¶

Find the common peaks ¶

Visual analysis of the common peaks ¶

Statistical analysis of common peaks ¶

Find the common valleys ¶

Part III - Data visualisation¶

6. Compare New Zealand and Australia ¶

Prepare the data from Australia ¶

Comparison of mobility data between New Zealand and Australia ¶

Comparison of Covid-19 data between New Zealand and Australia ¶

Rubrics¶

Overall grading¶

Rubrics for each question in the assignment¶

	Activity	Pearson's correlation coefficient
0	retail_and_recreation_percent_change_from_base...	0.764589
1	grocery_and_pharmacy_percent_change_from_baseline	0.758653
2	parks_percent_change_from_baseline	0.488620
3	transit_stations_percent_change_from_baseline	0.721324
4	workplaces_percent_change_from_baseline	0.677701
5	residential_percent_change_from_baseline	0.752879