Source code for src.create_initial_states.add_weekly_ids

import itertools as it
from typing import Dict
from typing import List

import numba as nb
import numpy as np
import pandas as pd
from numba.typed import List as NumbaList
from sid.config import DTYPE_INDEX
from sid.contacts import boolean_choice
from sid.contacts import choose_other_group
from sid.contacts import choose_other_individual
from sid.shared import factorize_assortative_variables

from src.create_initial_states.create_group_transition_probs import (
    create_group_transition_probs,
)
from src.shared import draw_from_distribution_for_subset


def add_weekly_ids(
    states, weekly_dist, seed, query, col_prefix, county_assortativeness
):
    """Create one pair id column for every possible weekly contact.

    We draw from the number of weekly contacts distribution in how
    many weekly contact models a person participates and then randomly
    choose in which column she'll be paired with someone. Lastly, for each
    column we match people who participate in the respective contact model
    with a specified geographic assortativeness.

    Args:
        states (pandas.DataFrame): sid states DataFrame
        weekly_dist (pandas.Series): the index is the support of the
            number of weekly contacts that are possible, the values are
            the frequencies in the synthetic population we aim for of each
            number of weekly contacts. One pair id column will be created
            for every possible contact.
        seed (int): seed. Consecutive integers starting at this value are
            handed to the drawing and the matching step.
        query (str): query which subset of the population participates.
            If None, everyone is grouped. (e.g. "occupation =='working'")
        col_prefix (str): prefix for the columns to be created.
        county_assortativeness (float): share of weekly contacts that
            should belong to the same county.

    Returns:
        weekly_ids (pandas.DataFrame): DataFrame with the index of ``states``
            and one column ``{col_prefix}_{i}`` for each
            ``i in range(weekly_dist.index.max())`` holding the pair ids
            produced by ``_create_pairs``. Note that ``states`` itself is
            neither modified nor returned.

    """
    seed = it.count(seed)
    max_contacts = weekly_dist.index.max()
    weekly_id_cols = [f"{col_prefix}_{i}" for i in range(max_contacts)]

    # Create the columns up front so that the block assignment below only
    # overwrites existing columns.
    weekly_ids = pd.DataFrame(index=states.index)
    for col_name in weekly_id_cols:
        weekly_ids[col_name] = np.nan

    nr_of_weekly_contacts = draw_from_distribution_for_subset(
        states=states,
        distribution=weekly_dist,
        query=query,
        seed=next(seed),
        outside_val=0,
    ).to_numpy()

    pairs = _create_pairs(
        states=states,
        nr_of_weekly_contacts=nr_of_weekly_contacts,
        county_assortativeness=county_assortativeness,
        seed=next(seed),
    )

    # The pair array has as many columns as the largest number of contacts
    # that was actually drawn. If nobody drew the largest value of the support
    # it is narrower than the frame; fill the remaining columns with -1, the
    # value _create_pairs_numba uses for individuals without a pair.
    n_missing = max_contacts - pairs.shape[1]
    if n_missing > 0:
        padding = np.full((pairs.shape[0], n_missing), -1, dtype=pairs.dtype)
        pairs = np.hstack((pairs, padding))

    weekly_ids[weekly_id_cols] = pairs
    return weekly_ids


def _create_pairs(states, nr_of_weekly_contacts, county_assortativeness, seed):
    """Match individuals into weekly pairs, assortatively by county.

    Args:
        states (pandas.DataFrame): sid states DataFrame. Must contain the
            column "county".
        nr_of_weekly_contacts (numpy.ndarray): number of weekly contacts of
            every individual, passed on to ``_create_participation_array``.
        county_assortativeness (float): value stored as the assortative
            matching parameter for "county" that is handed to
            ``create_group_transition_probs``.
        seed (int): seed. ``seed + 1`` is used to draw the participation
            array, ``seed`` itself to match the pairs.

    Returns:
        numpy.ndarray: the pair id array returned by ``_create_pairs_numba``.

    """
    model_name = "fake_model"

    county_codes, _ = factorize_assortative_variables(states, ["county"])
    county_indexer = _create_group_indexer(states, ["county"])

    # create_group_transition_probs reads the assortativeness from a params
    # DataFrame, so wrap the scalar in a one-row frame for a made-up model.
    params_index = pd.MultiIndex.from_tuples(
        [("assortative_matching", model_name, "county")]
    )
    assortativeness_params = pd.DataFrame(
        data=county_assortativeness, columns=["value"], index=params_index
    )
    cum_probs = create_group_transition_probs(
        states=states,
        assort_by=["county"],
        params=assortativeness_params,
        model_name=model_name,
    )

    participation = _create_participation_array(
        nr_of_weekly_contacts, seed=seed + 1
    )
    return _create_pairs_numba(
        to_match=participation,
        indexer=county_indexer,
        first_stage_cum_probs=cum_probs,
        group_codes_per_individual=county_codes,
        seed=seed,
    )


@nb.njit
def _create_pairs_numba(
    to_match, indexer, first_stage_cum_probs, group_codes_per_individual, seed
):
    """Match participating individuals into pairs, column by column.

    For every column, individuals are visited in row order. Each individual
    that still needs a partner first draws a group and then a partner among
    the individuals of that group that still need a partner in this column.

    Args:
        to_match (numpy.ndarray): 2d boolean array with one row per individual
            and one column per sub-contact model. The array is copied, the
            caller's array is not modified.
        indexer (numba.typed.List): Numba list that maps id of county to a
            numpy array with the row positions of all individuals from that
            county.
        first_stage_cum_probs (numpy.ndarray): Array of shape n_groups, n_groups.
            cum_probs[i, j] is the probability that an individual from group i
            meets someone from group j or lower.
        group_codes_per_individual (numpy.ndarray): 1d array with assortative
            matching group ids, coded as integers.
        seed (int): seed for the random number generator used inside the
            compiled function.

    Returns:
        out (numpy.ndarray): 2d integer array of the same shape as
            ``to_match`` with meeting ids. Both members of a pair share an id,
            ids are counted from 0 separately in every column and -1 marks
            individuals without a pair in that column.

    """
    np.random.seed(seed)
    unique_group_codes = np.arange(len(first_stage_cum_probs))
    to_match = to_match.copy()
    out = np.full(to_match.shape, -1)
    n_obs, n_models = to_match.shape
    for m in range(n_models):
        meeting_id = 0
        for i in range(n_obs):
            if to_match[i, m]:
                group_i = group_codes_per_individual[i]
                group_j = choose_other_group(
                    unique_group_codes, first_stage_cum_probs[group_i]
                )
                group_j_indices = indexer[group_j]
                # Take i out of the pool before building the weights.
                # Otherwise i has a positive weight whenever the drawn group
                # is its own and could be paired with itself.
                # NOTE(review): assumes choose_other_individual only draws
                # entries with positive weight and returns -1 if there are
                # none - confirm against sid.contacts.
                to_match[i, m] = False
                weights = to_match[group_j_indices, m].astype(np.float64)
                j = choose_other_individual(group_j_indices, weights)
                if j != -1:
                    to_match[j, m] = False
                    out[i, m] = meeting_id
                    out[j, m] = meeting_id
                    meeting_id += 1
                else:
                    # Nobody was available in the drawn group. Put i back so
                    # that individuals visited later can still choose i.
                    to_match[i, m] = True
    return out


@nb.njit
def _create_participation_array(nr_of_contacts, seed):
    """Draw randomly in which pair columns each individual participates.

    Every cell is drawn separately with ``boolean_choice`` using the
    individual's number of contacts divided by the largest number of contacts
    as probability.

    Args:
        nr_of_contacts (numpy.ndarray): 1d array with the number of contacts,
            i.e. the number of pairs in which every individual is supposed to
            participate. The specific pair columns are drawn here.
        seed (int): seed

    Returns:
        participation (numpy.ndarray): boolean array of shape
            (len(nr_of_contacts), nr_of_contacts.max()).
            If participation[i, mod] is True, individual i was drawn
            to participate in mod.

    """
    np.random.seed(seed)
    n_individuals = len(nr_of_contacts)
    n_columns = nr_of_contacts.max()
    participation_prob = nr_of_contacts / n_columns

    drawn = np.zeros((n_individuals, n_columns), dtype=np.bool_)
    # Row-major order keeps the sequence of random draws fixed for a seed.
    for row in range(n_individuals):
        prob = participation_prob[row]
        for col in range(n_columns):
            drawn[row, col] = boolean_choice(prob)

    return drawn


def _create_group_indexer(
    states: pd.DataFrame, assort_by: List[str]
) -> nb.typed.List:
    """Create the group indexer.

    The indexer is a list where the positions correspond to the group number defined by
    assortative variables. The values inside the list are one-dimensional integer arrays
    containing the indices of states belonging to the group.

    If there are no assortative variables, all individuals are assigned to a single
    group with code 0 and the indexer is a list where the first position contains all
    indices of states.

    For efficiency reasons, we assign each group a number instead of identifying by
    the values of the assort_by variables directly.

    Note: This function is from sid commit 206886a14eeb3257deb71db91aba4e7fb2385fc2.

    Args:
        states (pandas.DataFrame): The states.
        assort_by (List[str]): List of variables that influence matching probabilities.
            May be empty.

    Returns:
        indexer (numba.typed.List): The i_th entry are the indices of the i_th group.

    """
    # After resetting the index the group labels are row positions.
    states = states.reset_index()
    indexer = NumbaList()

    if assort_by:
        groups = states.groupby(assort_by).groups
        _, group_codes_values = factorize_assortative_variables(states, assort_by)

        for group in group_codes_values:
            # the keys of groups are not tuples if there was just one assort_by variable
            # but the group_codes_values are.
            is_single_tuple = isinstance(group, tuple) and len(group) == 1
            key = group[0] if is_single_tuple else group
            indexer.append(groups[key].to_numpy(dtype=DTYPE_INDEX))
    else:
        indexer.append(states.index.to_numpy(DTYPE_INDEX))

    return indexer