import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import itertools
import collections
import math
import pprint
from statistics import mean, variance, pvariance
try:
from google.colab import drive
drive.mount('/content/gdrive')
root_path = './'
except:
root_path = './'
cases_df = pd.read_csv(root_path + '/data/COVID-19_Cases_Summarized_by_Date__Transmission_and_Case_Disposition.csv')
cases_df.isna().sum()
The query above confirms that no missing values were found.
cases_df["Date"] = pd.to_datetime(cases_df["Date"], format="%Y/%m/%d")
# Detect outliers for each type of data
types = ['Confirmed', 'Death']
outliers = {}
f, axes = plt.subplots(1, 2, figsize=[14, 6])
for i, t in enumerate(types):
curr_df = cases_df[cases_df['Case Disposition'] == t].groupby(['Date'])['Case Count'].sum().reset_index()
curr_ds = curr_df['Case Count']
sns.boxplot(curr_ds, orient='v', ax=axes[i]).set(xlabel=t)
Q1 = np.percentile(curr_ds, 25)
Q3 = np.percentile(curr_ds, 75)
IQR = Q3 - Q1
outlier_step = 1.5 * IQR
outliers[t] = curr_df[(curr_ds < Q1 - outlier_step) | (curr_ds > Q3 + outlier_step)]
# Drop outlier rows
for t in outliers.keys():
for d in set(outliers[t]['Date']):
cases_df.drop(cases_df[(cases_df['Case Disposition'] == t) & (cases_df['Date'] == d)].index, inplace=True)
crime_df = pd.read_csv(root_path + '/data/Police_Department_Incident_Reports__2018_to_Present.csv')
crime_df.isna().sum()
columns_to_consider = ["Incident Datetime",
"Incident Date",
"Incident Time",
"Incident Year",
"Incident Day of Week",
"Report Datetime",
"Row ID",
"Incident ID",
"Incident Number",
"Report Type Code",
"Report Type Description",
"Incident Code",
"Incident Category",
"Incident Subcategory",
"Incident Description",
"Resolution",
"Police District"]
crime_df = crime_df[columns_to_consider]
crime_df = crime_df[crime_df["Incident Category"].notna()]
crime_df["Incident Date"] = pd.to_datetime(crime_df["Incident Date"], format="%Y/%m/%d")
crime_df["Incident Datetime"] = pd.to_datetime(crime_df["Incident Datetime"], format="%Y/%m/%d %I:%M:%S %p")
def convert_agg_df_into_flatten_df(df):
df = df.reset_index()
df.columns = [' '.join(col).strip() for col in df.columns.values]
return df
For smoothing, we use a simple moving average over a one-week window.
unique_crimes = crime_df.groupby(["Incident ID"]).agg({"Incident Date": "first"}).reset_index()
unique_crimes.head()
date_freq = unique_crimes["Incident Date"].value_counts().to_frame().reset_index()
date_freq = date_freq.rename(columns={"index": "date", "Incident Date": "count"})
date_freq = date_freq.sort_values(by="date")
figure, axes = plt.subplots(figsize=(20, 10), nrows=2, ncols=1)
sns.lineplot(x='date', y='count', data=date_freq, ax=axes[0])
date_freq['smoothen_counts'] = date_freq['count'].rolling(window=7).mean()
sns_axes = sns.lineplot(x='date', y='smoothen_counts', data=date_freq, ax=axes[1])
# Adding x-ticks for each month
base_date = datetime.datetime.strptime('2018-01-01', '%Y-%m-%d')
dates = [base_date + datetime.timedelta(days=i * 30) for i in range(30)]
xfmt = mdates.DateFormatter('%Y-%m')
axes[1].xaxis.set_major_formatter(xfmt)
axes[1].set_xticks(dates)
x_dates = [date.strftime("%Y-%m") for date in dates]
remnant = axes[1].set_xticklabels(labels=x_dates, rotation=90, ha='right')
As we can see, there is a huge drop after the first week of March 2020. This is because the first coronavirus case in San Francisco was observed on March 5th, after which social-distancing measures were implemented.
Next, we look at changes in crime-rate trends based on the type of crime and filing status.
vehicle_unique_reports = crime_df[(crime_df["Report Type Description"] == "Vehicle Initial") |
(crime_df["Report Type Description"] == "Vehicle Supplement")]
vehicle_unique_reports = vehicle_unique_reports.groupby(["Incident ID"]).agg({"Incident Date": "first"}).reset_index()
online_unique_reports = crime_df[(crime_df["Report Type Description"] == "Coplogic Initial") |
(crime_df["Report Type Description"] == "Coplogic Supplement")]
online_unique_reports = online_unique_reports.groupby(["Incident ID"]).agg({"Incident Date": "first"}).reset_index()
vehicle_stats = vehicle_unique_reports["Incident Date"].value_counts().to_frame().reset_index()
vehicle_stats = vehicle_stats.rename(columns={"index": "date", "Incident Date": "count"})
vehicle_stats = vehicle_stats.sort_values(by="date")
vehicle_stats['smoothen_counts'] = vehicle_stats['count'].rolling(window=7).mean()
online_stats = online_unique_reports["Incident Date"].value_counts().to_frame().reset_index()
online_stats = online_stats.rename(columns={"index": "date", "Incident Date": "count"})
online_stats = online_stats.sort_values(by="date")
online_stats['smoothen_counts'] = online_stats['count'].rolling(window=7).mean()
vehicle_stats["type"] = "vehicle"
online_stats["type"] = "online"
merged_df = pd.concat([vehicle_stats, online_stats])
merged_df = merged_df[merged_df["date"] > datetime.datetime(2019, 12, 31)]
figure, axes = plt.subplots(figsize=(20, 10), nrows=2, ncols=1)
sns.lineplot(x='date', y='count', hue="type", data=merged_df, ax=axes[0])
sns_axes = sns.lineplot(x='date', y='smoothen_counts', hue="type", data=merged_df, ax=axes[1])
We observe that crime reports filed online have decreased, while crime reports regarding stolen vehicles have remained more or less the same.
The likely reason is that most of the crime reports citizens file online relate to Lost Property or Theft of Property (News Source).
Also, in subsequent visualizations we find that Larceny Theft is the most frequent crime and shows the largest drop.
In summary, the most frequent crime, Larceny (Theft of Property), which is typically filed online, dropped after the coronavirus outbreak began. This may be because most people stopped leaving their houses, leaving fewer opportunities for such crimes.
We expected online filing of crimes to increase during the covid/lockdown period, but that is not the case here. We also expected vehicle crimes to decrease, since people would not be using their vehicles as much, but that does not seem to be the case either.
We look at some prominent types of crime incidents that are filed and observe their trends.
concern_cases = ["Larceny Theft", "Malicious Mischief", "Burglary",
"Motor Vehicle Theft", "Traffic Violation Arrest",
"Civil Sidewalks", "Traffic Collision", "Vandalism"]
df_list = []
for case in concern_cases:
case_level_df = crime_df[crime_df["Incident Category"] == case]
case_level_df = case_level_df.groupby(["Incident ID"]).agg({"Incident Date": "first"}).reset_index()
case_level_df = case_level_df["Incident Date"].value_counts().to_frame().reset_index()
case_level_df = case_level_df.rename(columns={"index": "date", "Incident Date": "count"})
case_level_df = case_level_df.sort_values(by="date")
case_level_df['smoothen_counts'] = case_level_df['count'].rolling(window=14).mean()
case_level_df['type'] = case
df_list.append(case_level_df)
merged_df = pd.concat(df_list)
merged_df = merged_df[merged_df["date"] > datetime.datetime(2019, 12, 31)]
figure, axes = plt.subplots(figsize=(20, 20), nrows=2, ncols=1)
sns.lineplot(x='date', y='count', hue='type', data=merged_df, ax=axes[0])
sns_axes = sns.lineplot(x='date', y='smoothen_counts', hue='type', data=merged_df, ax=axes[1])
# Group the data by Case Disposition(Death, New Case) and Date
cases_df_grouped = cases_df.groupby(by=['Case Disposition', 'Date'])
# Search for the start date and end date in the data
start_date = cases_df['Date'].min()
end_date = cases_df['Date'].max()
# Calculate the total number of days based on start date and end date
num_of_days = end_date - start_date
num_of_days = num_of_days.days + 1
# Prepare a list of number of cases and deaths where index = index of the day from start date
num_cases_list = [ 0 for i in range(num_of_days)]
deaths_list = [0 for i in range(num_of_days)]
# Iterate through the groups of data to extract number of cases and fatalities
# to store in a list
for name, group in cases_df_grouped:
if name[0] == 'Confirmed':
num_cases_list[(group['Date'].min() - start_date).days] = group['Case Count'].sum()
else:
deaths_list[(group['Date'].min()- start_date).days] = group['Case Count'].sum()
# Function to predict Auto Regression (AR) prediction values
def predict_ar(AR_p, day_of_the_week):
training_data_count = num_of_days - 7 - AR_p + day_of_the_week
x_num_cases = [[0 for i in range(AR_p+1)] for i in range(training_data_count)]
y_num_cases = [ 0 for i in range(training_data_count) ]
x_deaths = [[0 for i in range(AR_p+1)] for i in range(training_data_count)]
y_deaths = [ 0 for i in range(training_data_count) ]
for i in range(training_data_count):
x_num_cases[i][0] = 1
y_num_cases[i] = num_cases_list[i+AR_p]
x_deaths[i][0] = 1
y_deaths[i] = deaths_list[i+AR_p]
for j in range(AR_p):
x_num_cases[i][AR_p - j] = num_cases_list[i+j]
x_deaths[i][AR_p - j] = deaths_list[i+j]
# Prepare the X-matrix for prediction for number of casees and deaths
x_pred_num_of_cases = [0 for i in range(AR_p+1)]
x_pred_num_of_cases[0] = 1
x_pred_deaths = [0 for i in range(AR_p+1)]
x_pred_deaths[0] = 1
for i in range(AR_p):
x_pred_num_of_cases[i+1] = num_cases_list[training_data_count+AR_p-1-i]
x_pred_deaths[i+1] = deaths_list[training_data_count+AR_p-1-i]
x_num_cases = np.array(x_num_cases)
x_deaths = np.array(x_deaths)
beta_num_cases = [0 for i in range(AR_p+1)]
x_num_cases_transpose = np.matrix.transpose(x_num_cases)
# Calculate the beta matrix for number of cases by the formula: beta = (inverse(X_transpose*X))*(X_transpose)*(Y)
beta_num_cases = np.dot(np.dot(np.linalg.inv(np.dot(x_num_cases_transpose, x_num_cases)), x_num_cases_transpose), y_num_cases)
beta_deaths = [0 for i in range(AR_p+1)]
x_deaths_transpose = np.matrix.transpose(x_deaths)
# Calculate the beta matrix for deaths by the formula: beta = (inverse(X_transpose*X))*(X_transpose)*(Y)
beta_deaths = np.dot(np.dot(np.linalg.inv(np.dot(x_deaths_transpose, x_deaths)), x_deaths_transpose), y_deaths)
# Calculate the predicted number of cases and deaths by the formula: Y = beta*X
y_pred_num_cases = np.dot(x_pred_num_of_cases, beta_num_cases)
y_pred_deaths = np.dot(x_pred_deaths, beta_deaths)
return y_pred_num_cases, y_pred_deaths, num_cases_list[training_data_count+AR_p], deaths_list[training_data_count+AR_p]
# Function to predict the EWMA estimates
def predict_ewma(alpha):
y_pred_num_cases = [0 for i in range(len(num_cases_list))]
y_pred_deaths = [0 for i in range(len(deaths_list))]
# Calculate the initial predicted value as the actual value itself
y_pred_num_cases[0] = num_cases_list[0]
y_pred_deaths[0] = deaths_list[0]
# For subsequent predictions, use the formula: alpha*(y_t) + (1-alpha)*(y_hat_t)
for i in range(1,len(num_cases_list)):
y_pred_num_cases[i] = round((alpha*num_cases_list[i-1] + (1-alpha)*y_pred_num_cases[i-1]), 2)
y_pred_deaths[i] = round((alpha*deaths_list[i-1] + (1-alpha)*y_pred_deaths[i-1]), 2)
return y_pred_num_cases[-7:], y_pred_deaths[-7:]
# Function to calculate MAPE and MSE values
def calculate_mape_and_mse(y_pred_num_cases, y_num_cases, y_pred_deaths, y_deaths):
mape_num_cases = 0
mse_num_cases = 0
mape_deaths = 0
mse_deaths = 0
for i in range(len(y_num_cases)):
# Calculate MAPE for number of cases
# Drop the cases when the actual value is zero
if y_num_cases[i] != 0:
mape_num_cases = mape_num_cases + (abs(y_num_cases[i] - y_pred_num_cases[i]))/y_num_cases[i]
# Calculate MAPE for deaths
# Drop the cases when the actual value is zero
if y_deaths[i] != 0:
mape_deaths = mape_deaths + (abs(y_deaths[i] - y_pred_deaths[i]))/y_deaths[i]
# Calculate MSE for number of cases
mse_num_cases = mse_num_cases + (y_num_cases[i] - y_pred_num_cases[i])*(y_num_cases[i] - y_pred_num_cases[i])
# Calcuate MSE for deaths
mse_deaths = mse_deaths + (y_deaths[i] - y_pred_deaths[i])*(y_deaths[i] - y_pred_deaths[i])
mape_num_cases = (mape_num_cases*100)/len(y_num_cases)
mape_deaths = (mape_deaths*100)/len(y_num_cases)
mse_num_cases = mse_num_cases/len(y_num_cases)
    mse_deaths = mse_deaths/len(y_num_cases)
mape_list = []
mape_list.append(mape_num_cases)
mape_list.append(mape_deaths)
mse_list = []
mse_list.append(mse_num_cases)
mse_list.append(mse_deaths)
return mape_list, mse_list
# Make empty lists to store the actual and predicted values for fatalities and number
# of cases for last 1 week for AR(3), AR(5), EWMA(0.5), EWMA(0.8)
y_num_cases_7_days = []
y_deaths_7_days = []
y_pred_num_cases_7_days_ar_3 = []
y_pred_deaths_7_days_ar_3 = []
y_pred_num_cases_7_days_ar_5 = []
y_pred_deaths_7_days_ar_5 = []
y_pred_num_cases_ewma_5 = []
y_pred_deaths_ewma_5 = []
y_pred_num_cases_ewma_8 = []
y_pred_deaths_ewma_8 = []
# Call the functions to get the results
for i in range(7):
# Get the predictions for AR(3) for fatalities and number of cases for last 1 week
results = predict_ar(3, i)
y_pred_num_cases_7_days_ar_3.append(round(results[0],2))
y_pred_deaths_7_days_ar_3.append(round(results[1],2))
y_num_cases_7_days.append(results[2])
y_deaths_7_days.append(results[3])
# Get the predictions for AR(5) for fatalities and number of cases for last 1 week
results_1 = predict_ar(5, i)
y_pred_num_cases_7_days_ar_5.append(round(results_1[0],2))
y_pred_deaths_7_days_ar_5.append(round(results_1[1],2))
# Get the predictions for EWMA(alpha=0.5) for fatalities and number of cases for last 1 week
y_pred_num_cases_ewma_5, y_pred_deaths_ewma_5 = predict_ewma(0.5)
# Get the predictions for EWMA(alpha=0.8) for fatalities and number of cases for last 1 week
y_pred_num_cases_ewma_8, y_pred_deaths_ewma_8 = predict_ewma(0.8)
# Calculate MAPE and MSE values for AR(3)
mape_ar_3, mse_ar_3 = calculate_mape_and_mse(y_pred_num_cases_7_days_ar_3, y_num_cases_7_days, y_pred_deaths_7_days_ar_3, y_deaths_7_days)
# Calculate MAPE and MSE values for AR(5)
mape_ar_5, mse_ar_5 = calculate_mape_and_mse(y_pred_num_cases_7_days_ar_5, y_num_cases_7_days, y_pred_deaths_7_days_ar_5, y_deaths_7_days)
# Calculate MAPE and MSE values for EWMA with alpha 0.5
mape_ewma_5, mse_ewma_5 = calculate_mape_and_mse(y_pred_num_cases_ewma_5, y_num_cases_7_days, y_pred_deaths_ewma_5, y_deaths_7_days)
# Calculate MAPE and MSE values for EWMA with alpha 0.8
mape_ewma_8, mse_ewma_8 = calculate_mape_and_mse(y_pred_num_cases_ewma_8, y_num_cases_7_days, y_pred_deaths_ewma_8, y_deaths_7_days)
print("Actual number of new cases for the last week(day1 - day7): "+str(y_num_cases_7_days))
print("Actual number of deaths for the last week: "+ str(y_deaths_7_days))
print()
print("Autoregression(3) number of cases prediction for the last week: "+str(y_pred_num_cases_7_days_ar_3))
print("Autoregression(3) deaths prediction for the last week: "+str(y_pred_deaths_7_days_ar_3))
print()
print("Autoregression(5) number of cases prediction for the last week: "+str(y_pred_num_cases_7_days_ar_5))
print("Autoregression(5) deaths prediction for the last week: "+str(y_pred_deaths_7_days_ar_5))
print()
print("EWMA with alpha 0.5 number of cases prediction for the last week: "+ str(y_pred_num_cases_ewma_5))
print("EWMA with alpha 0.5 deaths prediction for the last week: "+str(y_pred_deaths_ewma_5))
print()
print("EWMA with alpha 0.8 number of cases prediction for the last week: "+str(y_pred_num_cases_ewma_8))
print("EWMA with alpha 0.8 deaths prediction for the last week: "+str(y_pred_deaths_ewma_8))
print()
print("MAPE for AR(3) for number of cases: "+str(round(mape_ar_3[0],2)))
print("MAPE for AR(3) for deaths: "+str(round(mape_ar_3[1],2)))
print("MSE for AR(3) for number of cases: "+str(round(mse_ar_3[0],2)))
print("MSE for AR(3) for deaths: "+str(round(mse_ar_3[1],2)))
print()
print("MAPE for AR(5) for number of cases: "+str(round(mape_ar_5[0],2)))
print("MAPE for AR(5) for deaths: "+str(round(mape_ar_5[1],2)))
print("MSE for AR(5) for number of cases: "+str(round(mse_ar_5[0],2)))
print("MSE for AR(5) for deaths: "+str(round(mse_ar_5[1],2)))
print()
print("MAPE for EWMA with alpha 0.5 for number of cases: "+str(round(mape_ewma_5[0],2)))
print("MAPE for EWMA with alpha 0.5 for deaths: "+str(round(mape_ewma_5[1],2)))
print("MSE for EWMA with alpha 0.5 for number of cases: "+str(round(mse_ewma_5[0],2)))
print("MSE for EWMA with alpha 0.5 for deaths: "+str(round(mse_ewma_5[1],2)))
print()
print("MAPE for EWMA with alpha 0.8 for number of cases: "+str(round(mape_ewma_8[0],2)))
print("MAPE for EWMA with alpha 0.8 for deaths: "+str(round(mape_ewma_8[1],2)))
print("MSE for EWMA with alpha 0.8 for number of cases: "+str(round(mse_ewma_8[0],2)))
print("MSE for EWMA with alpha 0.8 for deaths: "+str(round(mse_ewma_8[1],2)))
There were a couple of days in the number-of-cases data with no entry, and many days in the deaths data with no entry. We have assumed zero cases and zero deaths for such days and made predictions accordingly.
Actual number of new cases for the last week (day 1 - day 7): 29, 50, 38, 73, 40, 6, 1
Actual number of deaths for the last week: 1, 2, 1, 0, 0, 0, 0
AR(3) number of cases prediction: 33.75, 34.4, 36.85, 35.97, 49.12, 45.97, 35.07
AR(3) deaths prediction: 0.67, 0.38, 0.49, 0.6, 0.9, 0.63, 0.27
AR(5) number of cases prediction: 35.13, 34.12, 36.5, 37.91, 45.28, 43.06, 38.19
AR(5) deaths prediction: 0.67, 0.33, 0.49, 0.46, 0.83, 0.74, 0.48
EWMA (alpha=0.5) number of cases prediction: 36.8, 32.9, 41.45, 39.72, 56.36, 48.18, 27.09
EWMA (alpha=0.5) deaths prediction: 0.14, 0.57, 1.28, 1.14, 0.57, 0.28, 0.14
EWMA (alpha=0.8) number of cases prediction: 36.62, 30.52, 46.1, 39.62, 66.32, 45.26, 13.85
EWMA (alpha=0.8) deaths prediction: 0.03, 0.81, 1.76, 1.15, 0.23, 0.05, 0.01
(Note: Records where the actual value is 0 are disregarded in the MAPE calculation)
MAPE for AR(3) for number of cases: 599.61
MAPE for AR(3) for deaths: 23.57
MSE for AR(3) for number of cases: 640.0
MSE for AR(3) for deaths: 4.63
MAPE for AR(5) for number of cases: 636.4
MAPE for AR(5) for deaths: 23.93
MSE for AR(5) for number of cases: 615.39
MSE for AR(5) for deaths: 4.84
MAPE for EWMA (alpha=0.5) for number of cases: 495.52
MAPE for EWMA (alpha=0.5) for deaths: 26.5
MSE for EWMA (alpha=0.5) for number of cases: 600.03
MSE for EWMA (alpha=0.5) for deaths: 4.59
MAPE for EWMA (alpha=0.8) for number of cases: 305.34
MAPE for EWMA (alpha=0.8) for deaths: 33.21
MSE for EWMA (alpha=0.8) for number of cases: 573.8
MSE for EWMA (alpha=0.8) for deaths: 4.31
For AR(3) and AR(5), we do not see a considerable difference in predictions for both number of cases and deaths. The MAPE and MSE for those techniques are also very similar. The EWMA (alpha=0.5) and EWMA(alpha=0.8) also predict similar values for most of the number of cases and deaths in the week. Overall, EWMA techniques perform better than AR for our case of prediction.
Generally, comparing the accuracy of all the techniques, EWMA seems much better than AR. Among all the performed techniques, the smallest MAPE percentage for number of cases is observed for EWMA (alpha = 0.8) which is 305.34 % and for deaths is observed for AR(3) which is 23.57 %. The smallest MSE for number of cases is observed for EWMA (alpha=0.8) which is 573.8, while the smallest MSE for deaths is observed again for EWMA(alpha=0.8) which is 4.31. Comparing the above results, EWMA(alpha=0.8) seems much better than all the other techniques for this specific set of predictions.
The reason for such high MAPE and MSE for the number of cases is the unusual actual numbers in San Francisco. The counts are surprisingly low, or are increasing at a much slower rate than in other places. Hence the models predict high values for the number of cases and deaths while the actual values are very low. For example, the number of cases for the last 2 days is reported as only 6 and 1. For AR(3), the predictions are 45.97 and 35.07, which give percentage errors of approximately 660% and 3400% respectively. This skews the MAPE values.
from datetime import timedelta
# Filter death cases and compute death counts
death_cases = cases_df[cases_df['Case Disposition'] == 'Death']
death_counts = death_cases.groupby('Date')['Case Count'].sum().reset_index()
# Get the latest date in the data and compute last week and second last week date
latest_date = death_counts.sort_values(by='Date', ascending=False)['Date'].iloc[0]
last_week_date = latest_date - timedelta(weeks=1)
second_last_week_date = last_week_date - timedelta(weeks=1)
# Get death counts in last week imputing 0 value for missing dates and store in X
last_week_death_counts = death_counts[death_counts['Date'] > last_week_date]
X = []
for i in range(1, 8):
curr_date = (last_week_date + timedelta(days=i))
if curr_date not in set(last_week_death_counts['Date']):
X.append(0)
else:
X.append(last_week_death_counts[last_week_death_counts['Date'] == curr_date]['Case Count'].iloc[0])
X = np.array(X)
# Get death counts in second last week imputing 0 value for missing dates and store in Y
second_last_week_death_counts = death_counts.loc[(death_counts['Date'] > second_last_week_date) & (death_counts['Date'] <= last_week_date)]
Y = []
for i in range(1, 8):
curr_date = (second_last_week_date + timedelta(days=i))
if curr_date not in set(second_last_week_death_counts['Date']):
Y.append(0)
else:
Y.append(second_last_week_death_counts[second_last_week_death_counts['Date'] == curr_date]['Case Count'].iloc[0])
Y = np.array(Y)
For all the tests in required inference 2,
$\mu_x$: mean of last week's data, and $\mu_y$: mean of second-last week's data.
Hypotheses:
$H_0: \mu = \mu_0$
$H_1: \mu \neq \mu_0$
$\mu$: Mean of last week data and $\mu_0$: Mean of second last week data
Assuming the original distribution is Poisson($\lambda$), we get
$\hat\mu_{MLE} = \hat\lambda_{MLE} = \bar X$ and
$\hat se(\hat\mu) = \sqrt{Var(\bar X)} = \sqrt{\frac{Var(X)}{n}} = \sqrt{\frac{\hat\lambda_{MLE}}{n}} = \sqrt{\frac{\bar X}{n}}$
n = len(X)
mu_hat = np.mean(X)
mu_0 = np.mean(Y)
sample_variance = np.mean(X)
se_hat = np.sqrt(sample_variance / n)
w = np.abs((mu_hat - mu_0) / se_hat)
w
For $\alpha = 0.05$, $Z_{\alpha/2} = 1.96$
Since the Wald's statistic $|w| \leq Z_{\alpha/2}$ we fail to reject the null hypothesis $H_0$ and conclude that $\mu = \mu_0$
Comment: If the true distribution is given to be Poisson, Wald's test is applicable here, since we are using the MLE estimator for the mean of a Poisson, which is asymptotically Normal. If we don't know the true distribution, then Wald's test may not be applicable, as we cannot compute the MLE estimator for the mean.
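As a sanity check, the two-sided p-value for this Wald statistic can be read off the standard normal distribution; a minimal sketch, assuming w holds the statistic computed above and using scipy only for the normal CDF:
from scipy import stats
# Two-sided p-value of the Wald statistic under the standard normal null distribution
wald_p_value = 2 * (1 - stats.norm.cdf(abs(w)))
print("Wald test two-sided p-value:", round(wald_p_value, 4))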
Hypotheses:
$H_0: \mu_x = \mu_y$
$H_1: \mu_x \neq \mu_y$
Similar to one sample test, assuming the original distributions are Poisson($\lambda_x$) and Poisson($\lambda_y$) distributed, we get
$\hat \mu_x = \bar X$ , $\hat \mu_y = \bar Y$ and
$\hat se(\hat \mu_x) = \sqrt{\frac{\bar X}{n}}$ , $\hat se(\hat \mu_y) = \sqrt{\frac{\bar Y}{m}}$
n = len(X)
m = len(Y)
mu_x = np.mean(X)
mu_y = np.mean(Y)
sample_variance_x = np.mean(X)
sample_variance_y = np.mean(Y)
se_hat = np.sqrt(sample_variance_x / n + sample_variance_y / m)
w = np.abs((mu_x - mu_y) / se_hat)
w
For $\alpha = 0.05$, $Z_{\alpha/2} = 1.96$
Since the Wald's statistic $|w| \leq Z_{\alpha/2}$ we fail to reject the null hypothesis $H_0$ and conclude that $\mu_x = \mu_y$
Comment: Similar to the one sample test, if the true distribution is given to be Poisson, Wald's test is applicable here, since we are using the MLE estimator for the mean of a Poisson, which is asymptotically Normal. If we don't know the true distribution, then Wald's test may not be applicable, as we cannot compute the MLE estimator for the mean.
Hypotheses:
$H_0: \mu = \mu_0$
$H_1: \mu \neq \mu_0$
n = len(X)
X_bar = np.mean(X)
mu_0 = np.mean(Y)
sample_variance = 1/n * np.sum(np.square(X - X_bar))
se_hat = np.sqrt(sample_variance / n)
t = np.abs((X_bar - mu_0) / se_hat)
t
For $\alpha = 0.05$ and $df = 6$, $t_{df,\alpha/2} = 2.447$
Since the t statistic $|t| \leq t_{n-1, \alpha/2}$, we fail to reject the null hypothesis $H_0$ and conclude that $\mu = \mu_0$
Comment: The T test is only applicable when the distribution of X is Normal. Since we don't know the original distribution and can't assume it to be Normal, this test is not applicable here.
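Rather than reading $t_{df,\alpha/2}$ from a table, the critical value and a two-sided p-value can be computed with scipy; a minimal sketch, assuming t and n from the cell above:
from scipy import stats
# Critical value and two-sided p-value for the one-sample t statistic (df = n - 1)
t_critical = stats.t.ppf(1 - 0.05 / 2, df=n - 1)
t_p_value = 2 * stats.t.sf(abs(t), df=n - 1)
print("t critical value:", round(t_critical, 3), " two-sided p-value:", round(t_p_value, 4))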
Hypotheses:
$H_0: \mu_x = \mu_y$
$H_1: \mu_x \neq \mu_y$
$D_i = X_i - Y_i$
D = X - Y
n = len(D)
D_bar = np.mean(D)
sample_variance_d = 1/n * np.sum(np.square(D - D_bar))
se_hat = np.sqrt(sample_variance_d / n)
t = np.abs(D_bar / se_hat)
t
For $\alpha = 0.05$ and $df = 6$, $t_{df,\alpha/2} = 2.447$
Since the t statistic $|t| \leq t_{n-1, \alpha/2}$, we fail to reject the null hypothesis $H_0$ and conclude that $\mu_x = \mu_y$
Comment: The paired T test is only applicable when the distribution of D is Normal. Since we don't know the original distributions of X and Y, we can't assume (X - Y) to be Normal either, and so this test is not applicable here.
Hypotheses:
$H_0: \mu_x = \mu_y$
$H_1: \mu_x \neq \mu_y$
$\bar D= \bar X - \bar Y$
n = len(X)
m = len(Y)
X_bar = np.mean(X)
Y_bar = np.mean(Y)
sample_variance_x = 1/n * np.sum(np.square(X - X_bar))
sample_variance_y = 1/m * np.sum(np.square(Y - Y_bar))
se_hat = np.sqrt(sample_variance_x / n + sample_variance_y / m)
t = np.abs((X_bar - Y_bar) / se_hat)
t
For $\alpha = 0.05$ and $df = n+m-2 = 12$, $t_{df,\alpha/2} = 2.179$
Since the t statistic $|t| \leq t_{n+m-2, \alpha/2}$, we fail to reject the null hypothesis $H_0$ and conclude that $\mu_x = \mu_y$
Comment: The unpaired T test is only applicable when the distributions of X and Y are Normal. Since we don't know the original distributions of X and Y, we can't assume them to be Normal, and so this test is not applicable here.
Hypotheses:
$H_0: \mu = \mu_0$
$H_1: \mu \neq \mu_0$
# Get all the values for deaths into X_all
first_date = cases_df.sort_values(by='Date', ascending=True)['Date'].iloc[0]
last_date = cases_df.sort_values(by='Date', ascending=False)['Date'].iloc[0]
X_all = []
i = 0
while (first_date + timedelta(days=i)) <= last_date:
curr_date = (first_date + timedelta(days=i))
if curr_date not in set(death_counts['Date']):
X_all.append(0)
else:
X_all.append(death_counts[death_counts['Date'] == curr_date]['Case Count'].iloc[0])
i += 1
X_all = np.array(X_all)
# Compute true variance using sample variance of entire available data
X_all_bar = np.mean(X_all)
true_variance = 1/len(X_all) * np.sum(np.square(X_all - X_all_bar))
n = len(X)
X_bar = np.mean(X)
mu_0 = np.mean(Y)
se_hat = np.sqrt(true_variance / n)
z = np.abs((X_bar - mu_0) / se_hat)
z
For $\alpha = 0.05$, $Z_{\alpha/2} = 1.96$
Since the Z statistic $|z| \leq Z_{\alpha/2}$ we fail to reject the null hypothesis $H_0$ and conclude that $\mu = \mu_0$
Comment: The Z test is only applicable when n is large enough for the Central Limit Theorem to apply; since n is small here (< 30), the Z test is not applicable.
All inferences for confirmed cases are similar to those for death cases in terminology and applicability.
# Filter confirmed cases and compute cases counts
confirmed_cases = cases_df[cases_df['Case Disposition'] == 'Confirmed']
cases_counts = confirmed_cases.groupby('Date')['Case Count'].sum().reset_index()
# Get the latest date in the data and compute last week and second last week date
latest_date = cases_counts.sort_values(by='Date', ascending=False)['Date'].iloc[0]
last_week_date = latest_date - timedelta(weeks=1)
second_last_week_date = last_week_date - timedelta(weeks=1)
# Get cases counts in last week imputing 0 value for missing dates and store in X
last_week_cases_counts = cases_counts[cases_counts['Date'] > last_week_date]
X = []
for i in range(1, 8):
curr_date = (last_week_date + timedelta(days=i))
if curr_date not in set(last_week_cases_counts['Date']):
X.append(0)
else:
X.append(last_week_cases_counts[last_week_cases_counts['Date'] == curr_date]['Case Count'].iloc[0])
X = np.array(X)
# Get cases counts in second last week imputing 0 value for missing dates and store in Y
second_last_week_cases_counts = cases_counts.loc[(cases_counts['Date'] > second_last_week_date) & (cases_counts['Date'] <= last_week_date)]
Y = []
for i in range(1, 8):
curr_date = (second_last_week_date + timedelta(days=i))
if curr_date not in set(second_last_week_cases_counts['Date']):
Y.append(0)
else:
Y.append(second_last_week_cases_counts[second_last_week_cases_counts['Date'] == curr_date]['Case Count'].iloc[0])
Y = np.array(Y)
Hypotheses:
$H_0: \mu = \mu_0$
$H_1: \mu \neq \mu_0$
Assuming the original distribution is Poisson($\lambda$), we get
$\hat\mu_{MLE} = \hat\lambda_{MLE} = \bar X$ and
$\hat se(\hat\mu) = \sqrt{Var(\bar X)} = \sqrt{\frac{Var(X)}{n}} = \sqrt{\frac{\hat\lambda_{MLE}}{n}} = \sqrt{\frac{\bar X}{n}}$
n = len(X)
mu_hat = np.mean(X)
mu_0 = np.mean(Y)
sample_variance = np.mean(X)
se_hat = np.sqrt(sample_variance / n)
w = np.abs((mu_hat - mu_0) / se_hat)
w
For $\alpha = 0.05$, $Z_{\alpha/2} = 1.96$
Since the Wald's statistic $|w| \leq Z_{\alpha/2}$ we fail to reject the null hypothesis $H_0$ and conclude that $\mu = \mu_0$
Comment: If the true distribution is given to be Poisson, Wald's test is applicable here, since we are using the MLE estimator for the mean of a Poisson, which is asymptotically Normal. If we don't know the true distribution, then Wald's test may not be applicable, as we cannot compute the MLE estimator for the mean.
Hypotheses:
$H_0: \mu_x = \mu_y$
$H_1: \mu_x \neq \mu_y$
Similar to one sample test, assuming the original distributions are Poisson($\lambda_x$) and Poisson($\lambda_y$) distributed, we get
$\hat \mu_x = \bar X$ , $\hat \mu_y = \bar Y$ and
$\hat se(\hat \mu_x) = \sqrt{\frac{\bar X}{n}}$ , $\hat se(\hat \mu_y) = \sqrt{\frac{\bar Y}{m}}$
n = len(X)
m = len(Y)
mu_x = np.mean(X)
mu_y = np.mean(Y)
sample_variance_x = np.mean(X)
sample_variance_y = np.mean(Y)
se_hat = np.sqrt(sample_variance_x / n + sample_variance_y / m)
w = np.abs((mu_x - mu_y) / se_hat)
w
For $\alpha = 0.05$, $Z_{\alpha/2} = 1.96$
Since the Wald's statistic $|w| \leq Z_{\alpha/2}$ we fail to reject the null hypothesis $H_0$ and conclude that $\mu_x = \mu_y$
Comment: Similar to the one sample test, if the true distribution is given to be Poisson, Wald's test is applicable here, since we are using the MLE estimator for the mean of a Poisson, which is asymptotically Normal. If we don't know the true distribution, then Wald's test may not be applicable, as we cannot compute the MLE estimator for the mean.
Hypotheses:
$H_0: \mu = \mu_0$
$H_1: \mu \neq \mu_0$
n = len(X)
X_bar = np.mean(X)
mu_0 = np.mean(Y)
sample_variance = 1/n * np.sum(np.square(X - X_bar))
se_hat = np.sqrt(sample_variance / n)
t = np.abs((X_bar - mu_0) / se_hat)
t
For $\alpha = 0.05$ and $df = 6$, $t_{df,\alpha/2} = 2.447$
Since the t statistic $|t| \leq t_{n-1, \alpha/2}$, we fail to reject the null hypothesis $H_0$ and conclude that $\mu = \mu_0$
Comment: The T test is only applicable when the distribution of X is Normal. Since we don't know the original distribution and can't assume it to be Normal, this test is not applicable here.
Hypotheses:
$H_0: \mu_x = \mu_y$
$H_1: \mu_x \neq \mu_y$
$D_i = X_i - Y_i$
D = X - Y
n = len(D)
D_bar = np.mean(D)
sample_variance_d = 1/n * np.sum(np.square(D - D_bar))
se_hat = np.sqrt(sample_variance_d / n)
t = np.abs(D_bar / se_hat)
t
For $\alpha = 0.05$ and $df = 6$, $t_{df,\alpha/2} = 2.447$
Since the t statistic $|t| \leq t_{n-1, \alpha/2}$, we fail to reject the null hypothesis $H_0$ and conclude that $\mu_x = \mu_y$
Comment: The paired T test is only applicable when the distribution of D is Normal. Since we don't know the original distributions of X and Y, we can't assume (X - Y) to be Normal either, and so this test is not applicable here.
Hypotheses:
$H_0: \mu_x = \mu_y$
$H_1: \mu_x \neq \mu_y$
$\bar D= \bar X - \bar Y$
n = len(X)
m = len(Y)
X_bar = np.mean(X)
Y_bar = np.mean(Y)
sample_variance_x = 1/n * np.sum(np.square(X - X_bar))
sample_variance_y = 1/m * np.sum(np.square(Y - Y_bar))
se_hat = np.sqrt(sample_variance_x / n + sample_variance_y / m)
t = np.abs((X_bar - Y_bar) / se_hat)
t
For $\alpha = 0.05$ and $df = n+m-2 = 12$, $t_{df,\alpha/2} = 2.179$
Since the t statistic $|t| \leq t_{n+m-2, \alpha/2}$, we fail to reject the null hypothesis $H_0$ and conclude that $\mu_x = \mu_y$
Comment: The unpaired T test is only applicable when the distributions of X and Y are Normal. Since we don't know the original distributions of X and Y, we can't assume them to be Normal, and so this test is not applicable here.
Hypotheses:
$H_0: \mu = \mu_0$
$H_1: \mu \neq \mu_0$
# Get all the values for confirmed cases into X_all
first_date = cases_df.sort_values(by='Date', ascending=True)['Date'].iloc[0]
last_date = cases_df.sort_values(by='Date', ascending=False)['Date'].iloc[0]
X_all = []
i = 0
while (first_date + timedelta(days=i)) <= last_date:
curr_date = (first_date + timedelta(days=i))
if curr_date not in set(cases_counts['Date']):
X_all.append(0)
else:
X_all.append(cases_counts[cases_counts['Date'] == curr_date]['Case Count'].iloc[0])
i += 1
X_all = np.array(X_all)
# Compute true variance using sample variance of entire available data
X_all_bar = np.mean(X_all)
true_variance = 1/len(X_all) * np.sum(np.square(X_all - X_all_bar))
n = len(X)
X_bar = np.mean(X)
mu_0 = np.mean(Y)
se_hat = np.sqrt(true_variance / n)
z = np.abs((X_bar - mu_0) / se_hat)
z
For $\alpha = 0.05$, $Z_{\alpha/2} = 1.96$
Since the Z statistic $|z| \leq Z_{\alpha/2}$ we fail to reject the null hypothesis $H_0$ and conclude that $\mu = \mu_0$
Comment: The Z test is only applicable when n is large enough for the Central Limit Theorem to apply; since n is small here (< 30), the Z test is not applicable.
def loadCovidDataset():
    # cases_df already holds the processed data from the data pre-processing step
    # It has already been cleared of outliers
return cases_df
covidDatatset = loadCovidDataset()
print (covidDatatset.head())
We consider the second-last week to span the period '2020-04-20' to '2020-04-26' (both dates inclusive).
We consider the last week to span the period '2020-04-27' to '2020-05-03' (both dates inclusive).
def preprocessCovidDataset(covidDatatset, startDate, endDate):
covidDatatset = covidDatatset.drop(['Transmission Category'], axis=1)
covidDatatset["Date"] = pd.to_datetime(covidDatatset["Date"], format="%Y/%m/%d")
covidDatatsetLast14Days = covidDatatset[(covidDatatset['Date'] >= startDate) & (covidDatatset['Date'] <= endDate)]
covidDatatsetLast14Days = covidDatatsetLast14Days.groupby(['Date','Case Disposition'], as_index=False)['Case Count'].sum()
return covidDatatsetLast14Days
def construct2DistributionsForCasesAndDeaths(covidDatatsetLast14Days, startDate, midDate, endDate):
sampleDistributions = {}
covidDatatsetLast14DaysCases= covidDatatsetLast14Days[(covidDatatsetLast14Days['Case Disposition'] == 'Confirmed')].drop(['Case Disposition'], axis=1)
covidDatatsetLast14DaysDeaths = covidDatatsetLast14Days[(covidDatatsetLast14Days['Case Disposition'] == 'Death')].drop(['Case Disposition'], axis=1)
sampleDistributions["covidDatatsetFirst7DaysCases"] = covidDatatsetLast14DaysCases[(covidDatatsetLast14DaysCases['Date'] >= startDate) & (covidDatatsetLast14DaysCases['Date'] < midDate)]['Case Count'].tolist()
sampleDistributions["covidDatatsetLast7DaysCases"] = covidDatatsetLast14DaysCases[(covidDatatsetLast14DaysCases['Date'] >= midDate) & (covidDatatsetLast14DaysCases['Date'] <= endDate)]['Case Count'].tolist()
sampleDistributions["covidDatatsetFirst7DaysDeaths"] = covidDatatsetLast14DaysDeaths[(covidDatatsetLast14DaysDeaths['Date'] >= startDate) & (covidDatatsetLast14DaysDeaths['Date'] < midDate)]['Case Count'].tolist()
sampleDistributions["covidDatatsetLast7DaysDeaths"] = covidDatatsetLast14DaysDeaths[(covidDatatsetLast14DaysDeaths['Date'] >= midDate) & (covidDatatsetLast14DaysDeaths['Date'] <= endDate)]['Case Count'].tolist()
return sampleDistributions
startDate = '2020-04-20'
midDate = '2020-04-27'
endDate = '2020-05-03'
covidDatatsetLast14Days = preprocessCovidDataset(covidDatatset, startDate, endDate)
sampleDistributions = construct2DistributionsForCasesAndDeaths(covidDatatsetLast14Days, startDate, midDate, endDate)
print ("Various Sample Distributions")
pprint.pprint (sampleDistributions)
Second Last Week #Cases Data
Second Last Week #Death Data
Last Week #Cases Data
Last Week #Deaths Data
def calculateMean(weekCount):
return mean(weekCount)
def calculateVariance(weekCount):
    m = calculateMean(weekCount)
    return sum([(xi - m)** 2 for xi in weekCount]) / len(weekCount)
def calculateBinomialN(weekCount):
    meanValue = mean(weekCount)
    return (meanValue**2 ) / (meanValue - calculateVariance(weekCount))
def calculateBinomialP(weekCount):
    return (1 - (calculateVariance(weekCount)/ calculateMean(weekCount)))
def calculateParameterForReferenceDistributions(sampleDistributions):
referenceDistributions = {}
# Calculate Poisson Distribution Parameters
referenceDistributions["poisson"] = {}
referenceDistributions["poisson"]["case"] = {}
referenceDistributions["poisson"]["case"]["type"] = "poisson"
referenceDistributions["poisson"]["case"]["label"] = "Last Week #Cases"
referenceDistributions["poisson"]["case"]["lambda"] = calculateMean(sampleDistributions["covidDatatsetFirst7DaysCases"])
referenceDistributions["poisson"]["death"] = {}
referenceDistributions["poisson"]["death"]["type"] = "poisson"
referenceDistributions["poisson"]["death"]["label"] = "Last Week #Deaths"
referenceDistributions["poisson"]["death"]["lambda"] = calculateMean(sampleDistributions["covidDatatsetFirst7DaysDeaths"])
referenceDistributions["geometric"] = {}
referenceDistributions["geometric"]["case"] = {}
# Calculate Geometric Distribution Parameters
referenceDistributions["geometric"]["case"]["type"] = "geometric"
referenceDistributions["geometric"]["case"]["label"] = "Last Week #Cases"
referenceDistributions["geometric"]["case"]["p"] = 1 / calculateMean(sampleDistributions["covidDatatsetFirst7DaysCases"])
referenceDistributions["geometric"]["death"] = {}
referenceDistributions["geometric"]["death"]["type"] = "geometric"
referenceDistributions["geometric"]["death"]["label"] = "Last Week #Deaths"
referenceDistributions["geometric"]["death"]["p"] = 1 / calculateMean(sampleDistributions["covidDatatsetFirst7DaysDeaths"])
# Calculate Binomial Distribution Parameters
referenceDistributions["binomial"] = {}
referenceDistributions["binomial"]["case"] = {}
referenceDistributions["binomial"]["case"]["type"] = "binomial"
referenceDistributions["binomial"]["case"]["label"] = "Last Week #Cases"
referenceDistributions["binomial"]["case"]["n"] = round(calculateBinomialN(sampleDistributions["covidDatatsetFirst7DaysCases"]))
referenceDistributions["binomial"]["case"]["p"] = calculateBinomialP(sampleDistributions["covidDatatsetFirst7DaysCases"])
referenceDistributions["binomial"]["death"] = {}
referenceDistributions["binomial"]["death"]["type"] = "binomial"
referenceDistributions["binomial"]["death"]["label"] = "Last Week #Deaths"
referenceDistributions["binomial"]["death"]["n"] = round(calculateBinomialN(sampleDistributions["covidDatatsetFirst7DaysDeaths"]))
referenceDistributions["binomial"]["death"]["p"] = calculateBinomialP(sampleDistributions["covidDatatsetFirst7DaysDeaths"])
return referenceDistributions
referenceDistributions = calculateParameterForReferenceDistributions(sampleDistributions)
print ("Estimated Parameters for Possion, Geometric and Binomial Distributions based on Second Last Week for #Cases and #Deaths Results")
pprint.pprint (referenceDistributions)
print ("MME Estimate for paramter 'n' and 'p' of Binomial Distribution using Second Last Week #Cases is Negative. Hence we won't use this any further as the data clearly doesnt fit the Distribution")
print ("MME Estimate for paramter 'n' of Binomial Distribution : ", referenceDistributions["binomial"]["case"]["n"])
print ("MME Estimate for paramter 'p' of Binomial Distribution : ", referenceDistributions["binomial"]["case"]["p"])
MME successfully estimates parameters for each of the distributions.
Poisson Distribution Parameters calculated based on Second Last Week #Cases :
Poisson Distribution Parameters calculated based on Second Last Week #Deaths :
Geometric Distribution Parameters calculated based on Second Last Week #Cases :
Geometric Distribution Parameters calculated based on Second Last Week #Deaths :
Binomial Distribution Parameters calculated based on Second Last Week #Cases :
Binomial Distribution Parameters calculated based on Second Last Week #Deaths :
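For reference, these MME estimates come from matching sample moments to distribution moments. With sample mean $\bar X$ and sample variance $\hat\sigma^2$: for Poisson, $\hat\lambda = \bar X$; for Geometric, $\hat p = 1/\bar X$; and for Binomial, $E[X] = np$ and $Var(X) = np(1-p)$ give $\hat p = 1 - \frac{\hat\sigma^2}{\bar X}$ and $\hat n = \frac{\bar X}{\hat p} = \frac{\bar X^2}{\bar X - \hat\sigma^2}$. When $\hat\sigma^2 > \bar X$ (overdispersed counts), $\hat p$ and $\hat n$ turn negative, which is why the Binomial fit for the second-last-week #Cases is discarded above.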
Hypotheses:
$H_0:$ Sample Data is drawn from the Reference Distribution
$H_1:$ Sample Data is not drawn from the Reference Distribution
The KS Test (1 Sample) is used to check whether a sample is drawn from the Reference Distribution or not. We consider the Critical Threshold to be 0.05. If the Maximum Difference obtained by performing the KS Test is < the Critical Threshold, then we cannot reject $H_0$.
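The hand-rolled test below can be cross-checked against scipy's KS test; a minimal sketch, assuming sampleDistributions and referenceDistributions computed in the cells above (scipy also reports a p-value rather than comparing the statistic against a fixed 0.05 threshold):
from scipy import stats
import numpy as np
# Cross-check sketch: 1-sample KS test of last week's case counts against the
# Poisson reference fitted on the second-last week
last_week_cases = np.array(sampleDistributions["covidDatatsetLast7DaysCases"])
poisson_lambda = referenceDistributions["poisson"]["case"]["lambda"]
ks_stat, ks_p = stats.kstest(last_week_cases, lambda x: stats.poisson.cdf(x, mu=poisson_lambda))
print("KS statistic:", round(ks_stat, 4), " p-value:", round(ks_p, 4))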
def calculateProbability(elements):
elements = sorted(elements)
lengthweekCount = len(elements)
probabiltyForACount = 1 / lengthweekCount
weekInformation = collections.OrderedDict()
for dailyCount in elements:
if dailyCount in weekInformation:
weekInformation[dailyCount]["probability"] += probabiltyForACount
else:
weekInformation[dailyCount] = {}
weekInformation[dailyCount]["probability"] = probabiltyForACount
return weekInformation
def calculateWhenPoissonIsReference(x, referenceDistribution):
fy = 0
for i in range(0,x+1):
fy += (referenceDistribution["lambda"]**i) / (math.factorial(i))
fy = math.exp(-referenceDistribution["lambda"]) * fy
return fy
def calculateWhenGeometricIsReference(x, referenceDistribution):
fy = 1 - (1 - referenceDistribution["p"])**x
return fy
def calculateWhenBinomialIsReference(x, referenceDistribution):
fy = 0
if x > referenceDistribution["n"]:
return 1
for i in range(0,x+1):
nCr = (math.factorial(referenceDistribution["n"]) // math.factorial(i) // math.factorial(referenceDistribution["n"] - i))
fy += nCr * referenceDistribution["p"]**i * (1 - referenceDistribution["p"])**(referenceDistribution["n"] - i)
return fy
def calculate1SampleFy(x, referenceDistribution):
if referenceDistribution["type"] == "poisson":
return calculateWhenPoissonIsReference(x, referenceDistribution)
elif referenceDistribution["type"] == "geometric":
return calculateWhenGeometricIsReference(x, referenceDistribution)
else:
return calculateWhenBinomialIsReference(x, referenceDistribution)
def calculateTermsFor1SampleKSTest(sampleDistribution, referenceDistribution):
weekInformation = calculateProbability(sampleDistribution)
cummulativeProbability = 0
difference = []
for key, value in weekInformation.items():
weekInformation[key]["negativeFx"] = cummulativeProbability
cummulativeProbability += value["probability"]
weekInformation[key]["cummulativeProbability"] = cummulativeProbability
weekInformation[key]["positiveFx"] = cummulativeProbability
weekInformation[key]["Fy"] = calculate1SampleFy(key, referenceDistribution)
postiveDifference = abs(weekInformation[key]["negativeFx"] - weekInformation[key]["Fy"])
negativeDifference = abs(weekInformation[key]["positiveFx"] - weekInformation[key]["Fy"])
difference.append(postiveDifference)
difference.append(negativeDifference)
maxDifference = max(difference)
conclusion = "We REJECT the Null Hypothesis"
criticalValue = 0.05
if maxDifference < criticalValue:
conclusion = "We CANNOT Reject the Null Hypothesis"
print ("Sample Data: ", referenceDistribution["label"], "\nReference Distribution : ", referenceDistribution["type"], "\nMaximum Difference : ", maxDifference, "\nColusion : ", conclusion, "\n")
def perform1SampleKSTests(sampleDistributions, referenceDistributions):
calculateTermsFor1SampleKSTest(sampleDistributions["covidDatatsetLast7DaysCases"], referenceDistributions["poisson"]["case"])
calculateTermsFor1SampleKSTest(sampleDistributions["covidDatatsetLast7DaysDeaths"], referenceDistributions["poisson"]["death"])
calculateTermsFor1SampleKSTest(sampleDistributions["covidDatatsetLast7DaysCases"], referenceDistributions["geometric"]["case"])
calculateTermsFor1SampleKSTest(sampleDistributions["covidDatatsetLast7DaysDeaths"], referenceDistributions["geometric"]["death"])
calculateTermsFor1SampleKSTest(sampleDistributions["covidDatatsetLast7DaysDeaths"], referenceDistributions["binomial"]["death"])
perform1SampleKSTests(sampleDistributions, referenceDistributions)
Sample Data - Last Week #Cases Vs Reference Distribution - Poisson :
Sample Data - Last Week #Deaths Vs Reference Distribution - Poisson :
Sample Data - Last Week #Cases Vs Reference Distribution - Geometric :
Sample Data - Last Week #Deaths Vs Reference Distribution - Geometric :
Sample Data - Last Week #Deaths Vs Reference Distribution - Binomial :
Hypotheses:
$H_0:$ Samples (Sample Data 1 and Sample Data 2) are drawn from the same Distribution
$H_1:$ Samples (Sample Data 1 and Sample Data 2) are not drawn from the same Distribution
The KS Test (2 Sample) is used to check whether two samples (Sample Data 1 and Sample Data 2 here) are drawn from the same Distribution or not. We consider the Critical Threshold to be 0.05. If the Maximum Difference obtained by performing the KS Test is < the Critical Threshold, then we cannot reject $H_0$.
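Similarly, scipy's two-sample KS test can serve as a cross-check of the implementation below; a minimal sketch, assuming sampleDistributions from the cells above:
from scipy import stats
import numpy as np
# Cross-check sketch: 2-sample KS test between second-last week and last week case counts
first_week_cases = np.array(sampleDistributions["covidDatatsetFirst7DaysCases"])
last_week_cases = np.array(sampleDistributions["covidDatatsetLast7DaysCases"])
ks_stat, ks_p = stats.ks_2samp(first_week_cases, last_week_cases)
print("KS statistic:", round(ks_stat, 4), " p-value:", round(ks_p, 4))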
def calculateDifferenceWhenChangePointIsSame(day, firstWeekInformation, lastWeekInformation):
difference = []
lastWeekPositive = lastWeekInformation[day]["cummulativeProbability"]
currentIndex = list(lastWeekInformation).index(day)
if currentIndex == 0:
lastWeekNegative = 0
else:
items = list(lastWeekInformation.items())
previousItem = items[currentIndex-1]
lastWeekNegative = previousItem[1]["cummulativeProbability"]
firstWeekPositive = firstWeekInformation[day]["cummulativeProbability"]
currentIndex = list(firstWeekInformation).index(day)
if currentIndex == 0:
firstWeekNegative = 0
else:
items = list(firstWeekInformation.items())
previousItem = items[currentIndex-1]
firstWeekNegative = previousItem[1]["cummulativeProbability"]
difference.append(abs(lastWeekPositive - firstWeekPositive))
difference.append(abs(lastWeekNegative - firstWeekNegative))
return difference
def calculateTermsFor2SampleKSTest(firstWeek, lastWeek, label):
if len(lastWeek) > len(firstWeek):
firstWeek, lastWeek = lastWeek, firstWeek
difference = []
firstWeekInformation = calculateProbability(firstWeek)
lastWeekInformation = calculateProbability(lastWeek)
cummulativeProbability = 0
for key, value in firstWeekInformation.items():
cummulativeProbability += value["probability"]
firstWeekInformation[key]["cummulativeProbability"] = cummulativeProbability
cummulativeProbability = 0
for key, value in lastWeekInformation.items():
lastWeekInformation[key]["negativeFy"] = cummulativeProbability
cummulativeProbability += value["probability"]
lastWeekInformation[key]["cummulativeProbability"] = cummulativeProbability
lastWeekInformation[key]["positiveFy"] = cummulativeProbability
for day in lastWeek:
flag = True
if day in firstWeek:
difference += calculateDifferenceWhenChangePointIsSame(day, firstWeekInformation, lastWeekInformation)
else:
for key, value in reversed(firstWeekInformation.items()):
if day > key:
lastWeekInformation[day]["Fx"] = firstWeekInformation[key]["cummulativeProbability"]
flag = False
break
if flag:
lastWeekInformation[day]["Fx"] = 0
for key, value in lastWeekInformation.items():
if key not in firstWeek:
postiveDifference = abs(lastWeekInformation[key]["negativeFy"] - lastWeekInformation[key]["Fx"])
negativeDifference = abs(lastWeekInformation[key]["positiveFy"] - lastWeekInformation[key]["Fx"])
difference.append(postiveDifference)
difference.append(negativeDifference)
maxDifference = max(difference)
conclusion = "We REJECT the Null Hypothesis"
criticalValue = 0.05
if maxDifference < criticalValue:
conclusion = "We CANNOT Reject the Null Hypothesis"
print (label, "\nMaximum Difference : ", maxDifference, "\nColusion : ", conclusion, "\n")
def perform2SampleKSTests(sampleDistributions):
label = "Sample Data 1 : Second Last Week #Cases\nSample Data 2 : Last Week #Cases "
calculateTermsFor2SampleKSTest(sampleDistributions["covidDatatsetFirst7DaysCases"], sampleDistributions["covidDatatsetLast7DaysCases"], label)
label = "Sample Data 1 : Second Last Week #Deaths\nSample Data 2 : Last Week #Deaths "
calculateTermsFor2SampleKSTest(sampleDistributions["covidDatatsetFirst7DaysDeaths"], sampleDistributions["covidDatatsetLast7DaysDeaths"], label)
perform2SampleKSTests(sampleDistributions)
Sample Data 1 - Second Last Week #Cases Vs Sample Data 2 - Last Week #Cases :
Sample Data 1 - Second Last Week #Death Vs Sample Data 2 - Last Week #Death :
Hypotheses:
$H_0:$ Samples (Sample Data 1 and Sample Data 2) are drawn from the same Distribution
$H_1:$ Samples (Sample Data 1 and Sample Data 2) are not drawn from the same Distribution
The Permutation Test is used to check whether samples (Sample Data 1 and Sample Data 2 here) are drawn from the same Distribution or not. We consider the Alpha value to be 0.05. If the p-value obtained by performing the Permutation Test is < the Alpha value, then we reject $H_0$.
NOTE: To reduce computation, we cap the total number of permutations at a maximum of 350,000.
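An alternative to enumerating permutations with itertools is to sample random shuffles of the pooled data; a minimal Monte Carlo sketch of the same test (an approximation under an assumed sample size, not the capped enumeration used below):
import numpy as np
def performSampledPermutationTest(firstWeek, lastWeek, numberOfSamples=100000, seed=0):
    # Monte Carlo permutation test: shuffle the pooled counts and recompute
    # the absolute difference of group means each time
    rng = np.random.default_rng(seed)
    combined = np.array(firstWeek + lastWeek, dtype=float)
    nFirst = len(firstWeek)
    tObs = abs(np.mean(combined[:nFirst]) - np.mean(combined[nFirst:]))
    countGreater = 0
    for _ in range(numberOfSamples):
        shuffled = rng.permutation(combined)
        ti = abs(np.mean(shuffled[:nFirst]) - np.mean(shuffled[nFirst:]))
        if ti > tObs:
            countGreater += 1
    return countGreater / numberOfSamples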
def performPermutationTest(firstWeek, lastWeek, label, numberOfPermutationToTake = 350000):
lengthFirstWeek = len(firstWeek)
lengthLastWeek = len(lastWeek)
n = lengthFirstWeek + lengthLastWeek
meanFirstWeek = calculateMean(firstWeek)
meanLastWeek = calculateMean(lastWeek)
combinedWeek = firstWeek + lastWeek
tObs = abs(meanFirstWeek - meanLastWeek)
permutationList = list(itertools.islice(itertools.permutations(combinedWeek, n), numberOfPermutationToTake))
countTiGreaterThanTobs = 0
for row in permutationList:
data = {}
data["x"] = row[0:lengthFirstWeek]
data["y"] = row[lengthFirstWeek: n]
meanX = calculateMean(data["x"])
meanY = calculateMean(data["y"])
data["ti"] = abs(meanX - meanY)
if data["ti"] > tObs:
countTiGreaterThanTobs += 1
actualPermuationNumber = math.factorial(n)
if numberOfPermutationToTake > actualPermuationNumber:
numberOfPermutationToTake = actualPermuationNumber
pvalue = countTiGreaterThanTobs / numberOfPermutationToTake
conclusion = "We CANNOT Reject the Null Hypothesis"
if pvalue < 0.05:
conclusion = "We REJECT the Null Hypothesis"
print (label, "\nActual Number of Permutations we should do : ", actualPermuationNumber, "\nFor computation reasons we cap the total number of Permuatations to : ", numberOfPermutationToTake, "\nP Value : ", pvalue, "\nConclusion : ", conclusion, "\n")
def performPermutationTests(sampleDistributions):
label = "Sample Data 1 : Second Last Week #Cases\nSample Data 2 : Last Week #Cases "
performPermutationTest(sampleDistributions["covidDatatsetFirst7DaysCases"], sampleDistributions["covidDatatsetLast7DaysCases"], label)
label = "Sample Data 1 : Second Last Week #Deaths\nSample Data 2 : Last Week #Deaths "
performPermutationTest(sampleDistributions["covidDatatsetFirst7DaysDeaths"], sampleDistributions["covidDatatsetLast7DaysDeaths"], label)
performPermutationTests(sampleDistributions)
Sample Data 1 - Second Last Week #Cases Vs Sample Data 2 - Last Week #Cases :
Sample Data 1 - Second Last Week #Deaths Vs Sample Data 2 - Last Week #Deaths :
The one-month period selected is from March 6th, 2020 to April 5th, 2020.
cases_tot = cases_df[["Date","Case Count"]].groupby(["Date"]).sum().reset_index()
cases_tot.columns
start_date = "2020-03-06"
end_date = "2020-04-05"
mask = (cases_tot['Date'] >= start_date) & (cases_tot['Date'] <= end_date)
total_cases = cases_tot.loc[mask]
idx = pd.date_range('03-06-2020', '04-05-2020')
total_cases = total_cases.set_index("Date")
total_cases.index = pd.DatetimeIndex(total_cases.index)
total_cases = total_cases.reindex(idx, fill_value=0)
total_cases["Date"] = total_cases.index
total_cases = total_cases.reset_index(drop=True)
total_cases.head()
death_tot = cases_df.loc[cases_df['Case Disposition'] == "Death"][["Date","Case Count"]].groupby(["Date"]).sum().reset_index()
mask = (death_tot['Date'] >= start_date) & (death_tot['Date'] <= end_date)
death_tot = death_tot.loc[mask]
idx = pd.date_range('03-06-2020', '04-05-2020')
death_tot = death_tot.set_index("Date")
death_tot.index = pd.DatetimeIndex(death_tot.index)
death_tot = death_tot.reindex(idx, fill_value=0)
death_tot["Date"] = death_tot.index
death_tot = death_tot.reset_index(drop=True)
death_tot.sample(5)
unique_first = crime_df[(crime_df["Report Type Description"] == 'Initial') |
(crime_df["Report Type Description"] == 'Initial Supplement') |
(crime_df["Report Type Description"] == 'Vehicle Initial') |
(crime_df["Report Type Description"] == 'Coplogic Initial')]
crime_tot = unique_first[["Incident Date"]].groupby(["Incident Date"])["Incident Date"].count()
date_df = pd.DataFrame()
date_df['Incident Date']=crime_tot.index
date_df['Num_crimes']=crime_tot.values
mask = (date_df['Incident Date'] >= start_date) & (date_df['Incident Date'] <= end_date)
date_df = date_df.loc[mask]
date_df.reset_index(drop=True).head()
def cal_correlation(x,y):
std_x = np.std(x)
std_y = np.std(y)
mean_x = np.mean(x)
mean_y = np.mean(y)
tot = np.multiply(np.subtract(x,mean_x),np.subtract(y,mean_y))
num = np.mean(tot)
return num/(std_x*std_y)
cal_correlation(date_df['Num_crimes'].tolist(),total_cases['Case Count'].tolist())
We infer that the crime rate and coronavirus cases are negatively correlated. This makes sense because, as the number of coronavirus cases goes up, we expect the crime rate to go down.
cal_correlation(date_df['Num_crimes'].tolist(),death_tot['Case Count'].tolist())
Here again they are negatively correlated, but the correlation is not as strong as with total cases. Deaths are more sporadic, and that is probably why the correlation is weaker than with total cases.
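As a sanity check, the same Pearson correlations can be computed with NumPy's built-in routine, assuming the three frames cover the same 31 days as in the calls above:
# Cross-check of the hand-rolled Pearson correlations using NumPy
corr_crime_cases = np.corrcoef(date_df['Num_crimes'], total_cases['Case Count'])[0, 1]
corr_crime_deaths = np.corrcoef(date_df['Num_crimes'], death_tot['Case Count'])[0, 1]
print("Crime vs cases:", round(corr_crime_cases, 4), " Crime vs deaths:", round(corr_crime_deaths, 4))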
import math
from scipy.stats import gamma
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
sorted_df = cases_df.sort_values('Date')
print(sorted_df)
confirmed_Cases = sorted_df[sorted_df['Case Disposition'] == 'Confirmed']
confirmed_cases = confirmed_Cases.groupby('Date')['Case Count'].sum()
ccases = confirmed_cases[confirmed_cases.shape[0] - 28:] ##considering last 28 days
img=mpimg.imread(root_path + '/data/project5inf.jpg')
plt.figure(figsize=(10,10))
plt.axis('off')
plt.imshow(img)
# Finds the MAP estimate of the posterior by a grid search over candidate lambda values
def find_map(x, alpha, scale_):
    max_pdf = 0
    lambda_ = 0
    for i in x:
        # Posterior density Gamma(alpha, scale=scale_) evaluated at candidate lambda i
        f_lambda = gamma.pdf(i, alpha, scale=scale_)
        if f_lambda > max_pdf:
            max_pdf = f_lambda
            lambda_ = i
    return lambda_
colors = ['-g', '-r', '-b', '-y']
legends = ['First Round', 'Second Round', 'Third Round', 'Fourth Round']
lambda_mme = sum(ccases[0:7])/7
beta = lambda_mme
fig, ax = plt.subplots(figsize=(20, 10), nrows=1, ncols=1)
x = list(range(1,100))
# Plotting pdf for posterior distribution
for i in range(1, 5):
n = 7
alpha = sum(ccases[0:7 * i]) + 1
g_beta = n*i + 1/beta
ax.plot(x, gamma.pdf(x, alpha, scale=1/g_beta), colors[i - 1], lw=3, alpha=0.8)
ax.legend(legends)
ax.set_xlabel('Lambda')
ax.set_ylabel('f(Lambda|data) : PDF')
    map_estimate = find_map(x, alpha, 1/g_beta)
    print('MAP Estimation for {} is {}'.format(legends[i-1], map_estimate))
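The grid search above can be cross-checked against the closed-form mode of the Gamma posterior: for a Gamma with shape $\alpha$ and rate $\beta'$, the mode is $(\alpha - 1)/\beta'$ whenever $\alpha > 1$. A minimal sketch, reusing alpha and g_beta from the final round of the loop:
# Closed-form MAP (mode of Gamma(alpha, rate=g_beta)) as a cross-check of the grid search
closed_form_map = (alpha - 1) / g_beta
print('Closed-form MAP estimate for the fourth round:', round(closed_form_map, 2))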
San Francisco increased its testing facilities and equipment on April 22 ([News Source](https://www.nbcbayarea.com/news/local/san-francisco/sf-increases-testing-for-all-essential-workers-residents-with-covid-19-symptoms/2277523/)). We expect a serious rise in cases after that event. We check whether the weekly mean of cases after the news is greater than the weekly mean of cases before the event.
$H_0: \hat\mu \geq \mu_0$
$H_1: \hat\mu < \mu_0$
We know that, regardless of the original distribution of X, by the Central Limit Theorem $\overline{X} \sim N(\mu, \dfrac{\sigma^2}{n})$, where $\mu$ is the true mean and $\sigma^2$ is the true variance
Hence, Z-test is applicable here
confirm_cases = cases_df[cases_df["Case Disposition"] == "Confirmed"]
case_counts = confirm_cases.groupby('Date')['Case Count'].sum().reset_index()
# lockdown_effective_date = datetime.datetime(2020, 3, 17)
effective_date = datetime.datetime(2020, 4, 22)
# effective_date = datetime.datetime(2020, 3, 17)
before_start = effective_date - datetime.timedelta(days=7)
before_end = before_start + datetime.timedelta(days=6)
after_start = effective_date + datetime.timedelta(days=1)
after_end = after_start + datetime.timedelta(days=6)
mean_before = case_counts[(case_counts["Date"] >= before_start) & (case_counts["Date"] <= before_end)]["Case Count"].mean()
mean_after = case_counts[(case_counts["Date"] >= after_start) & (case_counts["Date"] <= after_end)]["Case Count"].mean()
X = case_counts["Case Count"].astype(float).to_numpy()
e_of_x = np.mean(X)
e_of_x_square = np.mean(np.multiply(X, X))
true_standard_deviation_of_X = math.sqrt(e_of_x_square - (e_of_x**2))
z_statistic = (mean_after - mean_before)/(true_standard_deviation_of_X / math.sqrt(7))
z_statistic
Since this is a left-tailed test, we check whether the z-statistic $\leq -Z_{\alpha}$ to reject the null hypothesis, i.e. z-statistic $\leq -1.645$ for a significance level of 0.05
z_threshold = -1.645 # for significance of 0.05
if z_statistic < z_threshold:
    print("We reject the null hypothesis: the weekly mean of cases did not rise after the new testing facilities were added")
else:
    print("We fail to reject the null hypothesis: the data are consistent with a rise in new cases as testing facilities and equipment increased")
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import math
import matplotlib.patches as mpatches
figures, axes = plt.subplots(figsize=(7, 5))
mu = 0
variance = 1
sigma = math.sqrt(variance)
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
axes.plot(x, stats.norm.pdf(x, mu, sigma))
x_till_statistic = np.arange(-3, z_statistic, 0.001)
y_till_statistic = stats.norm.pdf(x_till_statistic, mu, sigma)
axes.fill_between(x_till_statistic, y_till_statistic,0, alpha=0.5, color='b')
axes.axvline(x=z_threshold, linestyle='--', color='r', gid='threshold')
red_patch = mpatches.Patch(color='red', label='Threshold at 0.05')
blue_patch = mpatches.Patch(color='blue', alpha=0.5, label='p_value area')
axes.legend(handles=[red_patch, blue_patch])
axes.set_title("Z-test plot on standard normal")
plt.show()
$H_0: Gender \perp \!\!\! \perp Corona$ (independent)
$H_1: Gender \not\perp \!\!\! \perp Corona$ (not independent)
Gender population data: https://www.opendatanetwork.com/entity/1600000US0667000/San_Francisco_CA/demographics.population.count?year=2018
San Francisco population data: https://worldpopulationreview.com/us-cities/san-francisco-population/
gender_data = pd.read_csv(root_path + '/data/COVID-19_Cases_Summarized_by_Age_Group_and_Gender.csv', index_col=0)
gender_data["Confirmed Cases"] = gender_data["Confirmed Cases"].astype(int)
gender_cases = gender_data.groupby(["Gender"]).agg({"Confirmed Cases": "sum"}).reset_index()
male_cases = gender_cases[gender_cases["Gender"] == "Male"].iloc[0]['Confirmed Cases']
female_cases = gender_cases[gender_cases["Gender"] == "Female"].iloc[0]['Confirmed Cases']
total_female = 426074
total_male = 443970
non_corona_female = total_female - female_cases
non_corona_male = total_male - male_cases
observed_matrix = [[female_cases, non_corona_female], [male_cases, non_corona_male]]
total = float(total_male + total_female)
q_statistic = 0.0
total_row = [sum(observed_matrix[0]), sum(observed_matrix[1])]
total_col = [male_cases + female_cases, non_corona_male + non_corona_female]
# Pearson chi-square statistic: sum over cells of (expected - observed)^2 / expected,
# where expected = row total * column total / grand total under independence
for i in range(2):
for j in range(2):
expected = float(total_row[i]) * float(total_col[j]) / total
observed = observed_matrix[i][j]
q_statistic = q_statistic + (((expected - observed) ** 2) / expected)
degrees_of_freedom = (2 - 1) * (2 - 1)
print("Q statistic observed: " + str(q_statistic) + " Degrees of Freedom: " + str(degrees_of_freedom))
We use the scipy library only to fetch the p-value from the Chi-Square distribution.
from scipy import stats
p_value = 1 - stats.chi2.cdf(q_statistic, degrees_of_freedom)
print("P-value observed against q_statistic: " + str(p_value))
if p_value < 0.05:
print("We reject the null hypothesis: Gender and Corona cases are not independent, i.e. corona may not impact both genders equally")
else:
print("We fail to reject the null hypothesis: there is no evidence of dependence, i.e. corona impacts both genders almost equally")
Hence, we conclude that the numbers of positive and negative corona cases are independent of gender, i.e. the effect of corona is more or less the same for both genders.
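As a cross-check (a sketch; scipy's chi2_contingency computes the same Pearson statistic, with the Yates continuity correction disabled to match the manual computation above):
from scipy.stats import chi2_contingency
q_check, p_check, dof_check, expected_counts = chi2_contingency(observed_matrix, correction=False)
print("scipy chi2_contingency -> Q: {:.4f}, p-value: {:.4f}, dof: {}".format(q_check, p_check, dof_check))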
Below is a heatmap of the observed contingency matrix used in the chi-square test.
df = pd.DataFrame({"Corona Positive": [observed_matrix[i][0] for i in range(2)], "Corona Negative": [observed_matrix[i][1] for i in range(2)]}, index=["female", "male"])
sns.heatmap(df, annot=True, fmt=".01f")
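For comparison (a sketch reusing total_row, total_col, and total from the computation above), the expected counts under independence can be plotted the same way:
# Expected counts under independence: row total * column total / grand total
expected_matrix = [[float(total_row[i]) * float(total_col[j]) / total for j in range(2)] for i in range(2)]
df_expected = pd.DataFrame({"Corona Positive": [expected_matrix[i][0] for i in range(2)], "Corona Negative": [expected_matrix[i][1] for i in range(2)]}, index=["female", "male"])
plt.figure()
sns.heatmap(df_expected, annot=True, fmt=".01f")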
Has the lockdown affected the way corona is transmitted?
We want to infer whether the lockdown has affected the way corona is transmitted, i.e. by contact or by community spread. So we perform an unpaired T-test for each of these transmission types, over dates before the lockdown and after the lockdown. Two date ranges are taken: 2020-03-05 to 2020-03-27 and 2020-04-15 to 2020-05-08. The first range corresponds to dates before the lockdown and the second to dates when the lockdown was in effect. The lockdown was introduced on 2020-03-17; we add a 10-day buffer to account for people who contracted corona earlier but had not yet shown symptoms.
The same hypotheses and notation are used for the before-lockdown and the after-lockdown test.
Hypotheses:
$H_0: \mu_x = \mu_y$
$H_1: \mu_x \neq \mu_y$
$\bar D= \bar X - \bar Y$
$\mu_x$ is the mean of the community-transmission case counts
$\mu_y$ is the mean of the contact-transmission case counts
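Concretely, the unpaired statistic computed by performTtest below is $t = \dfrac{|\bar X - \bar Y|}{\sqrt{s_X^2/n + s_Y^2/m}}$, where $s_X^2$, $s_Y^2$ are the sample variances of the two groups and $n$, $m$ are their sizes.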
def loadCovidDataset(path):
# cases_df was already loaded and cleaned (outliers removed) in the pre-processing phase,
# so the path argument is unused and the cached dataframe is returned directly
return cases_df
def preprocessCovidDataset(covidDatatset, startDate, midDate1, midDate2, endDate):
covidDatatset["Date"] = pd.to_datetime(covidDatatset["Date"], format="%Y/%m/%d")
covidDatatsetFiltered = covidDatatset[((covidDatatset['Date'] >= startDate) & (covidDatatset['Date'] <= midDate1)) | ((covidDatatset['Date'] >= midDate2) & (covidDatatset['Date'] <= endDate)) ]
covidDatatsetFiltered = covidDatatsetFiltered.groupby(['Date','Transmission Category','Case Disposition'], as_index=False)['Case Count'].sum()
return covidDatatsetFiltered
def construct2DistributionsForCasesAndDeaths(covidDatatsetFiltered, startDate, midDate1, midDate2, endDate):
sampleDistributions = {}
covidDatatsetCases = covidDatatsetFiltered[(covidDatatsetFiltered['Case Disposition'] == 'Confirmed')].drop(['Case Disposition'], axis=1)
covidDatatsetCommunityCases = covidDatatsetCases[(covidDatatsetCases['Transmission Category'] == 'Community')].drop(['Transmission Category'], axis=1)
covidDatatsetContactCases = covidDatatsetCases[(covidDatatsetCases['Transmission Category'] == 'From Contact')].drop(['Transmission Category'], axis=1)
covidDatatsetUnknownCases = covidDatatsetCases[(covidDatatsetCases['Transmission Category'] == 'Unknown')].drop(['Transmission Category'], axis=1)
sampleDistributions["covidDatatsetCommunityCasesFirstHalf"] = covidDatatsetCommunityCases[(covidDatatsetCommunityCases['Date'] >= startDate) & (covidDatatsetCommunityCases['Date'] <= midDate1)]['Case Count'].tolist()
sampleDistributions["covidDatatsetContactCasesFirstHalf"] = covidDatatsetContactCases[(covidDatatsetContactCases['Date'] >= startDate) & (covidDatatsetContactCases['Date'] <= midDate1)]['Case Count'].tolist()
sampleDistributions["covidDatatsetUnknownCasesFirstHalf"] = covidDatatsetUnknownCases[(covidDatatsetUnknownCases['Date'] >= startDate) & (covidDatatsetUnknownCases['Date'] <= midDate1)]['Case Count'].tolist()
sampleDistributions["covidDatatsetCommunityCasesSecondHalf"] = covidDatatsetCommunityCases[(covidDatatsetCommunityCases['Date'] >= midDate2) & (covidDatatsetCommunityCases['Date'] <= endDate)]['Case Count'].tolist()
sampleDistributions["covidDatatsetContactCasesSecondHalf"] = covidDatatsetContactCases[(covidDatatsetContactCases['Date'] >= midDate2) & (covidDatatsetContactCases['Date'] <= endDate)]['Case Count'].tolist()
sampleDistributions["covidDatatsetUnknownCasesSecondHalf"] = covidDatatsetUnknownCases[(covidDatatsetUnknownCases['Date'] >= midDate2) & (covidDatatsetUnknownCases['Date'] <= endDate)]['Case Count'].tolist()
return sampleDistributions
def performTtest(X, Y):
X = np.asarray(X, dtype=float)
Y = np.asarray(Y, dtype=float)
n = len(X)
m = len(Y)
X_bar = np.mean(X)
Y_bar = np.mean(Y)
# Sample variances (1/n form) of the two groups
sample_variance_x = 1/n * np.sum(np.square(X - X_bar))
sample_variance_y = 1/m * np.sum(np.square(Y - Y_bar))
# Standard error of the difference in means, allowing unequal variances
se_hat = np.sqrt(sample_variance_x / n + sample_variance_y / m)
t = np.abs((X_bar - Y_bar) / se_hat)
return t
def performTtests(sampleDistributions):
print("T-Value for Community vs Contact before lockdown ",performTtest(sampleDistributions["covidDatatsetCommunityCasesFirstHalf"],sampleDistributions["covidDatatsetContactCasesFirstHalf"]))
print("T-Value for Community vs Contact after lockdown ",performTtest(sampleDistributions["covidDatatsetCommunityCasesSecondHalf"],sampleDistributions["covidDatatsetContactCasesSecondHalf"]))
startDate = '2020-03-05'
midDate1 = '2020-03-27'
midDate2 = '2020-04-15'
endDate = '2020-05-08'
path = '/data/COVID-19_Cases_Summarized_by_Date__Transmission_and_Case_Disposition.csv'
covidDatatset = loadCovidDataset(path)
covidDatatsetFiltered = preprocessCovidDataset(covidDatatset, startDate, midDate1, midDate2, endDate)
sampleDistributions = construct2DistributionsForCasesAndDeaths(covidDatatsetFiltered, startDate, midDate1, midDate2, endDate)
performTtests(sampleDistributions)
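As an optional cross-check (a sketch; scipy's Welch test uses the unbiased n-1 variance estimate, so its value can differ slightly from the 1/n computation in performTtest):
from scipy.stats import ttest_ind
t_check, p_check = ttest_ind(sampleDistributions["covidDatatsetCommunityCasesSecondHalf"], sampleDistributions["covidDatatsetContactCasesSecondHalf"], equal_var=False)
print("scipy Welch t-test (after lockdown): t = {:.3f}, p-value = {:.3f}".format(abs(t_check), p_check))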
T-Value for Community vs Contact before lockdown: 4.06467
For $\alpha = 0.05$ and $df = n+m-2 = 32$, $t_{df,\alpha/2} = 2.037$
Since the observed statistic satisfies $|t| \gt t_{n+m-2, \alpha/2}$, we reject the null hypothesis $H_0$ and conclude that $\mu_x \ne \mu_y$
T-Value for Community vs Contact after lockdown: 1.833
For $\alpha = 0.05$ and $df = n+m-2 = 35$, $t_{df,\alpha/2} = 2.030$
Since the observed statistic satisfies $|t| \leq t_{n+m-2, \alpha/2}$, we fail to reject the null hypothesis $H_0$; we cannot conclude that $\mu_x \neq \mu_y$
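The two-sided critical values quoted above can be fetched directly from the t-distribution (a sketch):
from scipy.stats import t as t_dist
print("t critical value for df=32:", round(t_dist.ppf(1 - 0.05/2, 32), 3)) # ~2.037
print("t critical value for df=35:", round(t_dist.ppf(1 - 0.05/2, 35), 3)) # ~2.030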
Conclusion:
Before the lockdown the means are not the same; community transmission appears higher, as observed from the T-value.
But after the lockdown, transmission by community and by contact becomes similar.