# Source code for src.prepare_data.task_prepare_work_shares

import numpy as np
import pandas as pd
import pytask

from src.config import BLD
from src.config import SRC


def clean_work_shares(path):
    """Load the work share table and index it by age interval.

    The CSV at *path* must provide the columns ``"age_group"``, ``"men"``
    and ``"women"``. The ``"men"`` and ``"women"`` columns are cast to float
    and divided by 100 (presumably converting percentages to fractions) and
    renamed to ``"male"`` and ``"female"``. The ``"age_group"`` labels are
    parsed by :func:`interval_age_group` into the numeric columns
    ``"age_lower"`` and ``"age_upper"``, which also define the index.

    Args:
        path (str or pathlib.Path): Location of the CSV file to read.

    Returns:
        pandas.DataFrame: The cleaned table without the ``"age_group"``
            column, indexed by a right-closed interval index named
            ``"interval"`` that spans ``(age_lower, age_upper]``.

    """
    work_shares = pd.read_csv(path)

    for col in ["men", "women"]:
        work_shares[col] = work_shares[col].astype(float) / 100

    # Applying a function that returns a Series yields one column per entry.
    work_shares[["age_lower", "age_upper"]] = work_shares["age_group"].apply(
        interval_age_group
    )

    work_shares = work_shares.drop(columns=["age_group"]).rename(
        columns={"men": "male", "women": "female"}
    )

    # Build the index from the bound columns in one vectorized call instead
    # of creating one pd.Interval per row. ``closed="right"`` is spelled out
    # because it is also the default of pd.Interval.
    work_shares.index = pd.IntervalIndex.from_arrays(
        work_shares["age_lower"],
        work_shares["age_upper"],
        closed="right",
        name="interval",
    )
    return work_shares
def interval_age_group(x):
    """Translate an age group label into the bounds of an age interval.

    Three label formats are understood:

    - ``"a-b"``: a range of ages, e.g. ``"15-20"``.
    - ``">=a"``: an open-ended group, e.g. ``">=65"``. The upper bound is
      infinite.
    - ``"a"``: a single age, e.g. ``"30"``.

    The lower bound is decreased and the upper bound increased by 0.5.

    Args:
        x (str): The age group label.

    Returns:
        pandas.Series: Two floats indexed by ``"age_lower"`` and
            ``"age_upper"``.

    Raises:
        ValueError: If a bound cannot be converted to float or a range label
            does not consist of exactly two parts.

    """
    if "-" in x:
        # Strict unpacking rejects malformed labels such as "1-2-3" instead
        # of silently ignoring the trailing parts.
        lower, upper = x.split("-")
    elif ">=" in x:
        # Take whatever follows the operator rather than slicing at a fixed
        # position, so that e.g. leading whitespace does not break parsing.
        lower, upper = x.partition(">=")[2], np.inf
    else:
        lower = upper = x

    bounds = (float(lower) - 0.5, float(upper) + 0.5)
    return pd.Series(bounds, index=["age_lower", "age_upper"])
@pytask.mark.depends_on(
    {
        "data": SRC
        / "original_data"
        / "population_structure"
        / "share_working_by_gender_2018.csv",
    }
)
@pytask.mark.produces(BLD / "data" / "population_structure" / "working_shares.pkl")
def task_prepare_work_shares(depends_on, produces):
    """Clean the raw work share table and store the result as a pickle.

    Args:
        depends_on (dict): Task dependencies; the entry ``"data"`` is the
            path of the raw CSV file handed to :func:`clean_work_shares`.
        produces (pathlib.Path): Path the cleaned DataFrame is pickled to.

    """
    work_by_age_and_gender = clean_work_shares(depends_on["data"])
    work_by_age_and_gender.to_pickle(produces)