Source code for src.prepare_data.task_prepare_county_information

import pandas as pd
import pytask

from src.config import BLD
from src.config import SRC
from src.prepare_data.task_prepare_rki_data import TRANSLATE_STATES


def _prepare_general_data(paths):
    """Load the raw population tables and combine them into one frame.

    Args:
        paths: Indexable collection of file paths. ``paths[0]`` is the main
            semicolon-separated, latin-1 encoded population table.
            ``paths[1]`` is a second semicolon-separated table whose rows are
            added below the first one (presumably the Berlin figures, judging
            by the local variable name -- its columns are assumed to already
            use the renamed labels; confirm against the data file).

    Returns:
        pandas.DataFrame: The combined table with ``id`` cast to ``str`` and
        ``name`` stripped of leading spaces and passed through
        ``TRANSLATE_STATES``. Row labels of both inputs are kept as they are,
        so the index may contain duplicate labels.
    """
    # ``skipfooter`` is only supported by the Python parser engine, hence
    # ``engine="python"``.
    df = pd.read_csv(
        paths[0],
        skiprows=7,
        skipfooter=4,
        sep=";",
        encoding="latin_1",
        engine="python",
    )

    df = df.rename(
        columns={
            "Unnamed: 0": "id",
            "Unnamed: 1": "name",
            "Insgesamt": "population",
            "männlich": "male",
            "weiblich": "female",
        }
    )

    df_berlin = pd.read_csv(paths[1], sep=";")

    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` with its
    # defaults (no index reset, no sorting) yields the same result.
    df = pd.concat([df, df_berlin])
    df["id"] = df["id"].astype(str)

    df["name"] = df["name"].str.lstrip(" ")
    df["name"] = df["name"].replace(TRANSLATE_STATES)

    return df


[docs]def _prepare_federal_states(df):
    states = df.loc[df["id"].str.len() == 2].copy().drop(index=0).reset_index(drop=True)
    states[["population", "male", "female", "id"]] = states[
        ["population", "male", "female", "id"]
    ].astype(int)

    states["weight"] = states.population / states.population.sum()

    return states


[docs]def _prepare_counties(df, states):
    # Include Hamburg by converting its state id to Kreisschlüssel.
    df.id = df.id.replace({"02": "02000"})

    counties = df.loc[df["id"].str.len() == 5].copy()
    counties["state"] = counties["id"].str[:2]

    columns = ["population", "male", "female", "id", "state"]
    for col in columns:
        counties[col] = pd.to_numeric(counties[col], errors="coerce")

    counties = counties.dropna()
    counties[columns] = counties[columns].astype(int)
    counties["weight"] = counties.population / counties.population.sum()

    counties = (
        counties.merge(
            states[["id", "name"]],
            left_on="state",
            right_on="id",
            validate="m:1",
            suffixes=("", "_y"),
        )
        .drop(columns=["id_y", "state"])
        .rename(columns={"name_y": "state"})
    )

    counties["name"] = counties["name"].str.strip()
    counties["state"] = counties["state"].str.strip()

    return counties


@pytask.mark.depends_on(
    [
        SRC / "original_data" / "population_structure" / "population.csv",
        SRC / "original_data" / "population_structure" / "population_berlin.csv",
        SRC / "prepare_data" / "task_prepare_rki_data.py",
        SRC / "config.py",
    ]
)
@pytask.mark.produces(
    [
        BLD / "data" / "population_structure" / "federal_states.parquet",
        BLD / "data" / "population_structure" / "counties.parquet",
    ]
)
def task_prepare_geographical_data_de(depends_on, produces):
    """Build the federal-state and county population tables.

    Args:
        depends_on: Dependencies injected by pytask, indexed by position in
            the list above. Only entries ``0`` and ``1`` (the two CSV files)
            are read; the remaining entries are listed as dependencies only.
        produces: Output locations injected by pytask, indexed by position:
            ``0`` receives the federal states, ``1`` the counties.
    """
    df = _prepare_general_data(depends_on)

    states = _prepare_federal_states(df)
    states.to_parquet(produces[0])

    counties = _prepare_counties(df, states)
    counties.to_parquet(produces[1])