import matplotlib.pyplot as plt
import pandas as pd
import pytask
import seaborn as sns
import yaml
from sid.colors import get_colors
from src.config import BLD
from src.config import PLOT_END_DATE
from src.config import PLOT_SIZE
from src.config import PLOT_START_DATE
from src.config import POPULATION_GERMANY
from src.plotting.plotting import style_plot
# Module-wide matplotlib defaults: switch off the right and top axes spines and
# the legend frame for every figure created after this module is imported.
plt.rcParams.update(
    dict.fromkeys(
        ("axes.spines.right", "axes.spines.top", "legend.frameon"),
        False,
    )
)
@pytask.mark.depends_on(
    {
        "data": BLD / "data" / "raw_time_series" / "vaccinations.xlsx",
    }
)
@pytask.mark.produces(
    {
        "vaccination_shares_raw": BLD
        / "data"
        / "vaccinations"
        / "vaccination_shares_raw.pkl",
        "vaccination_shares_extended": BLD
        / "data"
        / "vaccinations"
        / "vaccination_shares_extended.pkl",
        "fig_first_dose": BLD
        / "figures"
        / "data"
        / "share_of_individuals_with_first_vaccine.pdf",
        "fig_vaccination_shares": BLD
        / "figures"
        / "data"
        / "share_receiving_vaccination_per_day.pdf",
        "mean_vacc_share_per_day": BLD
        / "data"
        / "vaccinations"
        / "mean_vacc_share_per_day.yaml",
    }
)
def task_prepare_vaccination_data(depends_on, produces):
    """Build daily first-dose vaccination shares and extrapolate them forward.

    The task

    1. reads the "Impfungen_proTag" sheet and cleans it with
       ``_clean_vaccination_data``,
    2. plots the cumulative share with a first dose ("fig_first_dose"),
    3. stores the day-to-day change of that share ("vaccination_shares_raw"),
    4. pads the series with zeros back to 2020-01-01 and zeroes every day on
       which the cumulative share is still at or below 1%,
    5. writes the mean daily share per weekday since 2021-04-06 to YAML
       ("mean_vacc_share_per_day"),
    6. extends the series past the last observed day (up to 12 weeks after the
       first extrapolated day) using those weekday means,
    7. plots observed and extrapolated shares ("fig_vaccination_shares") and
       pickles the concatenated series ("vaccination_shares_extended").

    Args:
        depends_on (dict): Maps "data" to the path of the raw Excel file.
        produces (dict): Maps the product names listed above to output paths.

    """
    df = pd.read_excel(depends_on["data"], sheet_name="Impfungen_proTag")
    df = _clean_vaccination_data(df)

    # This figure is for comparing our numbers with newspaper sites.
    fig, ax = _plot_series(df["share_with_first_dose"], "")
    ax.set_xlim(pd.Timestamp(PLOT_START_DATE), pd.Timestamp(PLOT_END_DATE))
    fig.savefig(produces["fig_first_dose"])
    plt.close(fig)

    # Daily share = first difference of the cumulative share. The first day has
    # no predecessor, so its NaN difference is dropped.
    vaccination_shares = df["share_with_first_dose"].diff().dropna()
    vaccination_shares.to_pickle(produces["vaccination_shares_raw"])

    # Extend the data backwards to the start of 2020 with zeros.
    backward_dates = pd.date_range("2020-01-01", vaccination_shares.index.max())
    vaccination_shares = vaccination_shares.reindex(backward_dates)
    vaccination_shares = vaccination_shares.fillna(0)

    # The first individuals to be vaccinated were nursing home residents, who
    # are not in our synthetic data. We therefore assume the first 1% of
    # vaccinations went to them and set those days to zero.
    vaccination_shares[vaccination_shares.cumsum() <= 0.01] = 0

    # Family physicians started vaccinating on April 6th (Tue after Easter).
    # When extrapolating into the future we assume that the share vaccinated on
    # a given weekday stays constant at that weekday's mean since this date.
    start_physicians = pd.Timestamp("2021-04-06")
    after_start = vaccination_shares.loc[start_physicians:]
    dayname_to_mean = after_start.groupby(after_start.index.day_name()).mean()
    with open(produces["mean_vacc_share_per_day"], "w") as f:
        yaml.dump(data=dayname_to_mean.to_dict(), stream=f)

    start_date = vaccination_shares.index.max() + pd.Timedelta(days=1)
    end_date = start_date + pd.Timedelta(weeks=12)
    future_dates = pd.date_range(start_date, end_date)
    # Look every future weekday name up in the weekday means. ``map`` yields a
    # float Series directly and gives NaN for a weekday without a mean, whereas
    # ``replace`` would silently leave the day-name string in the data.
    extension = pd.Series(future_dates.day_name(), index=future_dates).map(
        dayname_to_mean
    )

    labeled = [
        ("raw data", vaccination_shares),
        ("extension", extension),
    ]
    fig, ax = _plot_labeled_series(labeled)
    ax.axvline(
        start_physicians,
        label="Start of family physicians receiving Covid vaccines",
        color="forestgreen",
    )
    ax.legend()
    fig.savefig(produces["fig_vaccination_shares"])
    plt.close(fig)

    extended = pd.concat([vaccination_shares, extension])
    _test_extended(extended)
    extended.to_pickle(produces["vaccination_shares_extended"])
def _clean_vaccination_data(df):
    """Convert the raw daily vaccination sheet into a date-indexed DataFrame.

    Rows from the "Gesamt" row onwards are dropped, "Datum" is parsed into a
    sorted "date" index and two columns are added: "received_first_dose" (the
    cumulative sum of the daily first-dose column) and "share_with_first_dose"
    (that cumulative sum divided by POPULATION_GERMANY).

    Args:
        df (pandas.DataFrame): Raw sheet with a "Datum" column that contains a
            "Gesamt" row, and either a "mindestens einmal geimpft" or an
            "Erstimpfung" column. The row labels are assumed to be the default
            consecutive integers, because the "Gesamt" label is decremented by
            one to find the last data row.

    Returns:
        pandas.DataFrame: Copy of the data rows, indexed by "date", with the
            two added columns.

    Raises:
        IndexError: If no row has "Datum" equal to "Gesamt".
        KeyError: If neither first-dose column is present.
        AssertionError: If one of the plausibility checks fails.

    """
    # Drop the summary rows below the last date.
    first_non_date_loc = df[df["Datum"] == "Gesamt"].index[0]
    df = df.loc[: first_non_date_loc - 1].copy(deep=True)
    df["date"] = pd.to_datetime(df["Datum"], dayfirst=True)

    # Check that the date conversion was correct (e.g. day/month not swapped).
    assert df["date"].min() == pd.Timestamp(year=2020, month=12, day=27)
    assert df["date"].max() < pd.Timestamp(year=2021, month=12, day=31)

    # sort_index is super important here because of the cumsum below!
    df = df.set_index("date").sort_index()

    # The first-dose column is named differently across versions of the sheet.
    if "mindestens einmal geimpft" in df.columns:
        first_dose_col = "mindestens einmal geimpft"
    else:
        first_dose_col = "Erstimpfung"
    df["received_first_dose"] = df[first_dose_col].cumsum()
    df["share_with_first_dose"] = df["received_first_dose"] / POPULATION_GERMANY

    assert df["share_with_first_dose"].sort_index().is_monotonic_increasing
    # From May 2021 onwards more than a quarter of the population must have a
    # first dose. The check has to use the share: the cumulative head count
    # trivially exceeds 0.25 and would never catch a scaling error.
    assert (df.loc["2021-05-01":, "share_with_first_dose"] > 0.25).all()
    return df
def _plot_series(sr, title, label=None):
    """Plot one time series, restricted to the plotting window, as a line.

    Args:
        sr (pandas.Series): Series to plot; it is sliced from PLOT_START_DATE
            to PLOT_END_DATE by label before plotting.
        title (str): Title of the axes.
        label (str, optional): Legend label handed to seaborn.

    Returns:
        tuple: The figure and axes as returned by ``style_plot``.

    """
    fig, ax = plt.subplots(figsize=PLOT_SIZE)
    window = sr.loc[PLOT_START_DATE:PLOT_END_DATE]
    # No ``ax`` is passed, so seaborn draws on the axes created just above.
    sns.lineplot(x=window.index, y=window, label=label)
    ax.set_title(title)
    fig, ax = style_plot(fig, ax)
    fig.tight_layout()
    return fig, ax
def _plot_labeled_series(labeled):
    """Plot several labeled time series into one figure.

    Args:
        labeled (list): (label, pandas.Series) tuples. Each series is sliced
            from PLOT_START_DATE to PLOT_END_DATE and drawn in its own color
            taken from the "categorical" palette of ``get_colors``.

    Returns:
        tuple: The figure and axes as returned by ``style_plot``, with the
            title set and the y-axis label cleared.

    """
    fig, ax = plt.subplots(figsize=PLOT_SIZE)
    palette = get_colors("categorical", len(labeled))

    for color, (label, sr) in zip(palette, labeled):
        visible = sr.loc[PLOT_START_DATE:PLOT_END_DATE]
        sns.lineplot(
            x=visible.index,
            y=visible,
            label=label,
            linewidth=2,
            color=color,
        )

    fig, ax = style_plot(fig, ax)
    ax.set_title("Actual and Extrapolated Share Receiving the Vaccination")
    ax.set_ylabel("")
    fig.tight_layout()
    return fig, ax
[docs]def _test_extended(sr):
assert sr.index.is_monotonic, "index is not monotonic."
assert not sr.index.duplicated().any(), "Duplicate dates in Series."
assert (sr.index == pd.date_range(start=sr.index.min(), end=sr.index.max())).all()