Source code for src.prepare_data.task_prepare_virus_variant_data
import pandas as pd
import pytask
from src.config import BLD
from src.config import SRC
from src.testing.shared import get_date_from_year_and_week
# Output files of the virus variant preparation task, keyed by the names under
# which the task looks them up in ``produces``. Both live in
# ``BLD / "data" / "virus_strains"``.
STRAIN_FILES = {
    key: BLD / "data" / "virus_strains" / file_name
    for key, file_name in (
        ("rki_strains", "rki_strains.csv"),
        ("virus_shares_dict", "virus_shares_dict.pkl"),
    )
}
@pytask.mark.depends_on(
    {
        "rki": SRC / "original_data" / "virus_strains_rki.csv",
        "testing_shared.py": SRC / "testing" / "shared.py",
    }
)
@pytask.mark.produces(STRAIN_FILES)
def task_prepare_virus_variant_data(depends_on, produces):
    """Build the strain share outputs from the raw RKI variant table.

    The CSV at ``depends_on["rki"]`` is read and passed through
    ``_prepare_rki_data``. The prepared frame is written as CSV to
    ``produces["rki_strains"]``. In addition, a dictionary with the keys
    ``"base_strain"``, ``"b117"`` and ``"delta"`` is pickled to
    ``produces["virus_shares_dict"]``.

    Args:
        depends_on: Mapping of dependency names to paths. Only ``"rki"`` is read
            here; ``"testing_shared.py"`` is declared but not opened
            (presumably so that changes to the imported helper module
            retrigger the task -- confirm against the pytask setup).
        produces: Mapping with the keys of ``STRAIN_FILES`` to output paths.

    """
    strain_shares = _prepare_rki_data(pd.read_csv(depends_on["rki"]))
    strain_shares.to_csv(produces["rki_strains"])

    # The pickled series carry the short strain names rather than the
    # ``share_``-prefixed column labels.
    b117_share = strain_shares["share_b117"].rename("b117")
    delta_share = strain_shares["share_delta"].rename("delta")

    shares_by_strain = {
        # The base strain is whatever is left after the two named variants.
        "base_strain": 1 - b117_share - delta_share,
        "b117": b117_share,
        "delta": delta_share,
    }
    pd.to_pickle(shares_by_strain, produces["virus_shares_dict"])
def _prepare_rki_data(df):
    """Convert the raw RKI variant table into daily strain shares.

    Steps, in order:

    1. Drop rows whose ``"week"`` entry is missing.
    2. Set ``"year"`` to 2021 for every row and derive a ``"date"`` column by
       applying ``get_date_from_year_and_week`` row-wise (presumably it reads
       the ``"year"`` and ``"week"`` entries -- confirm in
       ``src.testing.shared``).
    3. Index by ``"date"`` and cast all remaining columns to float.
    4. For every ``pct_*`` column add a ``share_*`` column holding the value
       divided by 100, then keep only the ``share_*`` columns.
    5. Reindex to one row per day between the first and last date and fill
       the gaps with ``DataFrame.interpolate`` (pandas' default method).
    6. Extend the index back to 2020-03-01, filling the added days with 0.0.

    Args:
        df (pandas.DataFrame): Raw table. Must contain a ``"week"`` column; the
            ``pct_*`` columns are the ones converted to shares.

    Returns:
        pandas.DataFrame: ``share_*`` columns on a daily index named ``"date"``
        that starts at 2020-03-01.

    """
    weekly = df.loc[df["week"].notna()].copy(deep=True)
    weekly["year"] = 2021
    weekly["date"] = weekly.apply(get_date_from_year_and_week, axis=1)
    weekly = weekly.set_index("date").astype(float)

    # Collect the percentage columns up front, then add one share column each.
    pct_cols = [col for col in weekly if col.startswith("pct_")]
    for pct_col in pct_cols:
        weekly[f"share_{pct_col.replace('pct_', '')}"] = weekly[pct_col] / 100

    shares = weekly[[col for col in weekly if col.startswith("share_")]]

    # The values are shares, not counts, so moving from the original dates to
    # a daily grid needs no rescaling (e.g. no division by 7).
    observed_days = pd.date_range(shares.index.min(), shares.index.max())
    shares = shares.reindex(observed_days)
    shares = shares.interpolate()

    # Add the period from 2020-03-01 up to the first observation with zeros.
    full_range = pd.date_range("2020-03-01", shares.index.max())
    shares = shares.reindex(full_range, fill_value=0.0)
    shares.index.name = "date"
    return shares