Source code for src.prepare_data.task_prepare_work_shares

import numpy as np
import pandas as pd
import pytask

from src.config import BLD
from src.config import SRC


def clean_work_shares(path):
    """Load the working-share table from a CSV file and tidy it.

    The CSV must provide the columns ``"men"``, ``"women"`` and
    ``"age_group"``. Any further columns are passed through untouched.

    Args:
        path: Anything accepted by ``pandas.read_csv`` that points to the
            raw table.

    Returns:
        pandas.DataFrame: The table with

        - ``"male"`` / ``"female"``: the original ``"men"`` / ``"women"``
          values cast to float and divided by 100 (the raw values are
          presumably percentages -- verify against the data file),
        - ``"age_lower"`` / ``"age_upper"``: the numeric bounds produced by
          ``interval_age_group`` for each ``"age_group"`` label,
        - an index named ``"interval"`` holding one ``pandas.Interval``
          (pandas' default closure, i.e. right-closed) per row, built from
          those bounds.

        The ``"age_group"`` column itself is dropped.
    """
    work_shares = pd.read_csv(path)

    for col in ["men", "women"]:
        work_shares[col] = work_shares[col].astype(float) / 100

    # ``apply`` with a function returning a Series expands into one column
    # per Series entry, which is what allows the two-column assignment.
    work_shares[["age_lower", "age_upper"]] = work_shares["age_group"].apply(
        interval_age_group
    )

    work_shares = work_shares.drop(columns=["age_group"])
    work_shares = work_shares.rename(columns={"men": "male", "women": "female"})

    work_shares["interval"] = work_shares.apply(
        lambda row: pd.Interval(row["age_lower"], row["age_upper"]), axis=1
    )
    return work_shares.set_index("interval")


def interval_age_group(x):
    """Translate an age-group label into numeric lower and upper bounds.

    Three label shapes are understood:

    - ``"a-b"``: a range; the first two ``-``-separated pieces are used.
    - ``">=a"``: an open-ended group; the upper bound is infinity.
    - ``"a"``: a single age; lower and upper start out identical.

    The lower bound is then shifted down by 0.5 and the upper bound up by
    0.5, so a single age ``a`` maps to ``(a - 0.5, a + 0.5)``.

    Args:
        x: The label. Non-string values (e.g. an integer age that pandas
            parsed from an all-numeric column) are converted with ``str``
            first, and surrounding whitespace is ignored.

    Returns:
        pandas.Series: Two floats indexed by ``"age_lower"`` and
        ``"age_upper"``.

    Raises:
        ValueError: If a piece of the label cannot be parsed as a float.
    """
    # Normalise first: ``"-" in x`` fails on numbers, and the ``[2:]`` slice
    # below relies on ">=" being the very first two characters.
    label = str(x).strip()

    if "-" in label:
        parts = label.split("-")
        lower, upper = parts[0], parts[1]
    elif ">=" in label:
        lower, upper = label[2:], np.inf
    else:
        lower = upper = label

    bounds = (float(lower) - 0.5, float(upper) + 0.5)
    return pd.Series(bounds, index=["age_lower", "age_upper"])


@pytask.mark.depends_on(
    {
        "data": SRC
        / "original_data"
        / "population_structure"
        / "share_working_by_gender_2018.csv",
    }
)
@pytask.mark.produces(BLD / "data" / "population_structure" / "working_shares.pkl")
def task_prepare_work_shares(depends_on, produces):
    """Clean the raw working-share CSV and store the result as a pickle.

    Args:
        depends_on: Mapping supplied via the ``depends_on`` marker; its
            ``"data"`` entry is the path of the raw CSV file.
        produces: Target path supplied via the ``produces`` marker; the
            cleaned DataFrame is pickled there.
    """
    work_by_age_and_gender = clean_work_shares(depends_on["data"])
    work_by_age_and_gender.to_pickle(produces)