# Source code for src.prepare_data.task_download_data

from pathlib import Path

import pandas as pd
import pytask
import requests
import yaml
from tqdm import tqdm

from src.config import BLD

# Directory inside the build folder that receives every raw download.
_RAW_TIME_SERIES_DIR = BLD / "data" / "raw_time_series"

# ``(url, target path)`` pairs consumed by the parametrized download task.
# NOTE(review): two RKI URLs embed a fixed ``;jsessionid=...`` token; it looks
# like a leftover from a browser session -- confirm it is still required.
PARAMETRIZED_DOWNLOADS = [
    (source_url, _RAW_TIME_SERIES_DIR / target_name)
    for target_name, source_url in (
        (
            "rki.csv",
            "https://www.arcgis.com/sharing/rest/content/items/"
            "f10774f1c63e40168479a1feb6c7ca74/data",
        ),
        (
            "test_statistics.xlsx",
            "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/"
            "Daten/Testzahlen-gesamt.xlsx;jsessionid="
            "3E410CDC013276FC28AD711373F5D82A.internet072?__blob=publicationFile",
        ),
        (
            "google_mobility.zip",
            "https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip",
        ),
        (
            "vaccinations.xlsx",
            "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Daten/"
            "Impfquotenmonitoring.xlsx;jsessionid="
            "3E410CDC013276FC28AD711373F5D82A.internet072?__blob=publicationFile",
        ),
        (
            "vaccinations_with_reason.tsv",
            "https://impfdashboard.de/static/data/germany_vaccinations_timeseries_v2.tsv",
        ),
        (
            "vaccination_deliveries.tsv",
            "https://impfdashboard.de/static/data/germany_deliveries_timeseries_v2.tsv",
        ),
        (
            "test_distribution.xlsx",
            "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Daten/"
            "Klinische_Aspekte.xlsx?__blob=publicationFile",
        ),
        (
            "stringency_data.csv",
            "https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/"
            "OxCGRT_latest.csv",
        ),
        (
            "reproduction_number.csv",
            "https://raw.githubusercontent.com/robert-koch-institut/SARS-CoV-2-Nowcasting_"
            "und_-R-Schaetzung/main/Nowcast_R_aktuell.csv",
        ),
    )
]


def _is_download_necessary(path, response):
    """Check whether a download is necessary.

    There are three criteria.

    1. If the file is missing, download it.
    2. The following two checks depend on each other.

       1. Some files have an entry in the header which specifies when the file was
          modified last. If the file has been modified, or if no modification date
          was stored for the local copy, download it.
       2. If the header has no entry for the last modified date, we compare file sizes.
          If the file sizes do not match, the file is downloaded.

    As a side effect, the ``last-modified`` header of ``response`` is stored in a
    ``.yaml`` file next to ``path`` (same name, different suffix).

    Parameters
    ----------
    path : pathlib.Path
        Location of the file on disk.
    response : requests.Response
        Response whose ``headers`` describe the online file.

    Returns
    -------
    is_necessary : bool
        Whether the file should be downloaded.
    reason : str
        Human-readable explanation of the decision.

    """
    path_yaml = path.with_suffix(".yaml")

    last_modified_offline = None
    if path_yaml.exists():
        metadata = yaml.safe_load(path_yaml.read_text())
        # An empty sidecar is loaded as ``None``; anything that is not a mapping
        # is treated like a missing sidecar instead of raising.
        if isinstance(metadata, dict) and metadata.get("last_modified") is not None:
            last_modified_offline = pd.to_datetime(metadata["last_modified"])

    raw_last_modified = response.headers.get("last-modified", None)
    last_modified_online = (
        pd.to_datetime(raw_last_modified) if raw_last_modified is not None else None
    )

    # NOTE(review): the sidecar is refreshed before the download takes place, so a
    # download that fails afterwards still leaves the new date on disk.
    path_yaml.write_text(yaml.dump({"last_modified": raw_last_modified}))

    if not path.exists():
        return True, f"The file {path.name} does not exist."

    if last_modified_online is not None:
        if last_modified_offline is None:
            # Without a stored date the comparison below would raise a TypeError
            # (timestamp vs. ``None``); the local copy cannot be verified, so
            # fetch it again.
            return True, f"No stored modification date for {path.name}."
        if last_modified_online > last_modified_offline:
            return True, f"{path.name} has been modified online."
        return False, f"File {path.name} is already downloaded."

    # No modification date online: fall back to comparing file sizes.
    file_size_offline = path.stat().st_size
    file_size_online = int(response.headers.get("content-length", 0))
    if file_size_online != file_size_offline:
        return True, f"File sizes differ for {path.name}"
    return False, f"File {path.name} is already downloaded."


def _downloader(
    file: Path, url: str, response: "requests.Response", timeout: float = 60.0
):
    """Stream ``url`` to ``file`` while showing a progress bar.

    The target is opened in ``"wb"`` mode, so an existing file is overwritten from
    the start; there is no resumption of partial downloads.

    Parameters
    ----------
    file : pathlib.Path
        Path of file on disk.
    url : str
        URL of file.
    response : requests.Response
        Response of a previous request for ``url``; only its ``content-length``
        header is read, to set the total of the progress bar.
    timeout : float
        Timeout in seconds handed to ``requests.get``.

    Raises
    ------
    requests.HTTPError
        If the server answers the GET request with an error status.

    """
    block_size = 1024

    # Using the response as a context manager closes the streamed connection even
    # if writing fails half-way.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Check the status before opening the target so that an error page is
        # never written to disk and an existing file is not truncated.
        r.raise_for_status()

        with open(file, "wb") as f, tqdm(
            total=int(response.headers.get("content-length", 0)),
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=file.name,
            initial=0,
            ascii=True,
            miniters=1,
        ) as pbar:
            for chunk in r.iter_content(32 * block_size):
                f.write(chunk)
                pbar.update(len(chunk))


def download_file(url: str, path: Path):
    """Download ``url`` to ``path`` unless the local copy is still current.

    A HEAD request is sent first and its headers are handed to
    ``_is_download_necessary``; only if that check asks for it is the file fetched
    with ``_downloader``.

    Parameters
    ----------
    url : str
        URL of the file.
    path : pathlib.Path
        Target location on disk. Strings are accepted and converted to a path.

    """
    path = Path(path)

    # ``requests.head`` does not follow redirects unless asked to; without
    # ``allow_redirects`` the headers of a redirect response would be inspected
    # instead of those of the file. Setting ``Accept-Encoding`` to ``None`` makes
    # requests omit that default header (presumably so that ``content-length``
    # refers to the uncompressed file -- confirm).
    response = requests.head(
        url,
        headers={"Accept-Encoding": None},
        allow_redirects=True,
        timeout=60,
    )

    is_necessary, _reason = _is_download_necessary(path, response)

    if is_necessary:
        _downloader(path, url, response)


@pytask.mark.parametrize("url, produces", PARAMETRIZED_DOWNLOADS)
def task_download_file(url, produces):
    """Run ``download_file`` for one ``(url, produces)`` pair of the downloads."""
    download_file(url=url, path=produces)