Source code for src.prepare_data.task_prepare_vaccination_data

import matplotlib.pyplot as plt
import pandas as pd
import pytask
import seaborn as sns
import yaml
from sid.colors import get_colors

from src.config import BLD
from src.config import PLOT_END_DATE
from src.config import PLOT_SIZE
from src.config import PLOT_START_DATE
from src.config import POPULATION_GERMANY
from src.plotting.plotting import style_plot


# Module-wide matplotlib defaults: hide the right and top axis spines and
# draw legends without a surrounding frame.
plt.rcParams["axes.spines.right"] = False
plt.rcParams["axes.spines.top"] = False
plt.rcParams["legend.frameon"] = False


@pytask.mark.depends_on(
    {
        "data": BLD / "data" / "raw_time_series" / "vaccinations.xlsx",
    }
)
@pytask.mark.produces(
    {
        "vaccination_shares_raw": BLD
        / "data"
        / "vaccinations"
        / "vaccination_shares_raw.pkl",
        "vaccination_shares_extended": BLD
        / "data"
        / "vaccinations"
        / "vaccination_shares_extended.pkl",
        "fig_first_dose": BLD
        / "figures"
        / "data"
        / "share_of_individuals_with_first_vaccine.pdf",
        "fig_vaccination_shares": BLD
        / "figures"
        / "data"
        / "share_receiving_vaccination_per_day.pdf",
        "mean_vacc_share_per_day": BLD
        / "data"
        / "vaccinations"
        / "mean_vacc_share_per_day.yaml",
    }
)
def task_prepare_vaccination_data(depends_on, produces):
    """Build the daily vaccination share series and its extrapolation.

    Steps:

    1. Read the sheet ``Impfungen_proTag`` of the raw Excel file and clean it.
    2. Plot the cumulative share with a first dose (``fig_first_dose``).
    3. Take first differences to get the share vaccinated per day and store
       them (``vaccination_shares_raw``).
    4. Extend the series backwards to 2020-01-01 with zeros and set every day
       on which the cumulative share is still at most 1% to zero.
    5. Compute the mean daily share per weekday from 2021-04-06 onwards, store
       it as yaml (``mean_vacc_share_per_day``) and use it to extrapolate the
       twelve weeks after the last observed date.
    6. Plot observed and extrapolated values (``fig_vaccination_shares``) and
       store the concatenated series (``vaccination_shares_extended``).

    Args:
        depends_on (dict): Maps ``"data"`` to the path of the raw Excel file.
        produces (dict): Maps the product names listed above to their paths.

    """
    df = pd.read_excel(depends_on["data"], sheet_name="Impfungen_proTag")
    df = _clean_vaccination_data(df)
    # this is for comparing with newspaper sites
    fig, ax = _plot_series(df["share_with_first_dose"], "")
    ax.set_xlim(pd.Timestamp(PLOT_START_DATE), pd.Timestamp(PLOT_END_DATE))
    fig.savefig(produces["fig_first_dose"])
    # close exactly this figure instead of whatever figure is current
    plt.close(fig)

    vaccination_shares = df["share_with_first_dose"].diff().dropna()
    vaccination_shares.to_pickle(produces["vaccination_shares_raw"])

    # extend data to 2020.
    backward_dates = pd.date_range("2020-01-01", vaccination_shares.index.max())
    vaccination_shares = vaccination_shares.reindex(backward_dates)
    vaccination_shares = vaccination_shares.fillna(0)

    # the first individuals to be vaccinated were nursing homes which are not
    # in our synthetic data so we exclude the first 1% of vaccinations to
    # be going to them.
    vaccination_shares[vaccination_shares.cumsum() <= 0.01] = 0

    # family physicians started vaccinating on April 6th (Tue after Easter)
    # we assume that the number of vaccinations is constant to the weekday's
    # mean when extrapolating into the future.
    start_physicians = pd.Timestamp("2021-04-06")
    after_start = vaccination_shares.loc[start_physicians:]

    dayname_to_mean = after_start.groupby(after_start.index.day_name()).mean()
    with open(produces["mean_vacc_share_per_day"], "w") as f:
        yaml.dump(data=dayname_to_mean.to_dict(), stream=f)

    start_date = vaccination_shares.index.max() + pd.Timedelta(days=1)
    end_date = start_date + pd.Timedelta(weeks=12)
    future_dates = pd.date_range(start_date, end_date)
    # Look up each future weekday's mean with ``map``. This yields a float
    # Series directly, whereas ``replace`` on the object-dtype day names only
    # becomes numeric through pandas' (deprecated) silent downcasting and
    # would keep the day name string for any weekday without a mean.
    future_day_names = pd.Series(
        future_dates.day_name().to_numpy(), index=future_dates
    )
    extension = future_day_names.map(dayname_to_mean)

    labeled = [
        ("raw data", vaccination_shares),
        ("extension", extension),
    ]
    fig, ax = _plot_labeled_series(labeled)
    ax.axvline(
        start_physicians,
        label="Start of family physicians receiving Covid vaccines",
        color="forestgreen",
    )
    # draw the legend on the axes we just plotted on, not on pyplot's state
    ax.legend()

    fig.savefig(produces["fig_vaccination_shares"])
    plt.close(fig)

    extended = pd.concat([vaccination_shares, extension])
    _test_extended(extended)
    extended.to_pickle(produces["vaccination_shares_extended"])


def _clean_vaccination_data(df):
    """Clean the raw vaccination sheet and add cumulative first dose columns.

    Rows from the one whose ``"Datum"`` entry is ``"Gesamt"`` onwards are
    dropped, the remaining ``"Datum"`` entries are parsed as day-first dates
    and become the sorted index ``"date"``.

    Args:
        df (pandas.DataFrame): Raw sheet. Must contain the column ``"Datum"``
            and either ``"mindestens einmal geimpft"`` or ``"Erstimpfung"``
            (presumably the number of first doses given on each day, since the
            column is cumulated here -- verify against the data source).

    Returns:
        pandas.DataFrame: Copy of the date rows, indexed by ``"date"``, with
            the additional columns ``"received_first_dose"`` (cumulative sum)
            and ``"share_with_first_dose"`` (cumulative sum divided by
            ``POPULATION_GERMANY``).

    Raises:
        AssertionError: If the parsed dates or the resulting shares fail the
            plausibility checks.

    """
    # drop rows below the last date
    first_non_date_loc = df[df["Datum"] == "Gesamt"].index[0]
    df = df.loc[: first_non_date_loc - 1].copy(deep=True)

    df["date"] = pd.to_datetime(df["Datum"], dayfirst=True)

    # check date conversion was correct
    assert df["date"].min() == pd.Timestamp(
        year=2020, month=12, day=27
    ), "The first date is not 2020-12-27."
    assert df["date"].max() < pd.Timestamp(
        year=2021, month=12, day=31
    ), "The last date is not before 2021-12-31."

    # sort_index is super important here because of the cumsum below!
    df = df.set_index("date").sort_index()

    try:
        df["received_first_dose"] = df["mindestens einmal geimpft"].cumsum()
    except KeyError:
        df["received_first_dose"] = df["Erstimpfung"].cumsum()
    df["share_with_first_dose"] = df["received_first_dose"] / POPULATION_GERMANY

    # df is already sorted by date, so the share can be checked directly.
    assert df[
        "share_with_first_dose"
    ].is_monotonic_increasing, "The share with a first dose decreases over time."
    # The 25% threshold is a share. Comparing it to the absolute count
    # ("received_first_dose") would be true for any non-empty data.
    assert (
        df.loc["2021-05-01":, "share_with_first_dose"] > 0.25
    ).all(), "Less than 25% with a first dose after 2021-05-01."

    return df


def _plot_series(sr, title, label=None):
    """Plot the part of ``sr`` between the plot start and end date.

    Args:
        sr (pandas.Series): Series to plot; it is sliced with
            ``PLOT_START_DATE:PLOT_END_DATE`` before plotting.
        title (str): Title of the axes.
        label (str, optional): Legend label of the line.

    Returns:
        tuple: The figure and axes as returned by ``style_plot``.

    """
    visible = sr.loc[PLOT_START_DATE:PLOT_END_DATE]

    fig, ax = plt.subplots(figsize=PLOT_SIZE)
    # seaborn draws on the current axes, i.e. the ones just created
    sns.lineplot(x=visible.index, y=visible, label=label)
    ax.set_title(title)

    fig, ax = style_plot(fig, ax)
    fig.tight_layout()
    return fig, ax


def _plot_labeled_series(labeled):
    """Plot several labeled series into one figure.

    Args:
        labeled (list): Tuples of ``(label, series)``. Each series is sliced
            with ``PLOT_START_DATE:PLOT_END_DATE`` and drawn in its own color
            from the ``"categorical"`` palette of ``get_colors``.

    Returns:
        tuple: The figure and axes as returned by ``style_plot``, with the
            title set and the y axis label removed.

    """
    fig, ax = plt.subplots(figsize=PLOT_SIZE)
    palette = get_colors("categorical", len(labeled))

    for (name, series), line_color in zip(labeled, palette):
        in_window = series.loc[PLOT_START_DATE:PLOT_END_DATE]
        sns.lineplot(
            x=in_window.index,
            y=in_window,
            label=name,
            linewidth=2,
            color=line_color,
        )

    fig, ax = style_plot(fig, ax)
    ax.set_title("Actual and Extrapolated Share Receiving the Vaccination")
    ax.set_ylabel("")
    fig.tight_layout()
    return fig, ax


[docs]def _test_extended(sr):
    assert sr.index.is_monotonic, "index is not monotonic."
    assert not sr.index.duplicated().any(), "Duplicate dates in Series."
    assert (sr.index == pd.date_range(start=sr.index.min(), end=sr.index.max())).all()