Source code for src.prepare_data.task_download_data

from pathlib import Path

import pandas as pd
import pytask
import requests
import yaml
from tqdm import tqdm

from src.config import BLD

[docs]PARAMETRIZED_DOWNLOADS = [ ( "https://www.arcgis.com/sharing/rest/content/items/" "f10774f1c63e40168479a1feb6c7ca74/data", BLD / "data" / "raw_time_series" / "rki.csv", ), ( "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/" "Daten/Testzahlen-gesamt.xlsx;jsessionid=" "3E410CDC013276FC28AD711373F5D82A.internet072?__blob=publicationFile", BLD / "data" / "raw_time_series" / "test_statistics.xlsx", ), ( "https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip", BLD / "data" / "raw_time_series" / "google_mobility.zip", ), ( "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Daten/" "Impfquotenmonitoring.xlsx;jsessionid=" "3E410CDC013276FC28AD711373F5D82A.internet072?__blob=publicationFile", BLD / "data" / "raw_time_series" / "vaccinations.xlsx", ), ( "https://impfdashboard.de/static/data/germany_vaccinations_timeseries_v2.tsv", BLD / "data" / "raw_time_series" / "vaccinations_with_reason.tsv", ), ( "https://impfdashboard.de/static/data/germany_deliveries_timeseries_v2.tsv", BLD / "data" / "raw_time_series" / "vaccination_deliveries.tsv", ), ( "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Daten/" "Klinische_Aspekte.xlsx?__blob=publicationFile", BLD / "data" / "raw_time_series" / "test_distribution.xlsx", ), ( "https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/" "OxCGRT_latest.csv", BLD / "data" / "raw_time_series" / "stringency_data.csv", ), ( "https://raw.githubusercontent.com/robert-koch-institut/SARS-CoV-2-Nowcasting_" "und_-R-Schaetzung/main/Nowcast_R_aktuell.csv", BLD / "data" / "raw_time_series" / "reproduction_number.csv",
), ]
[docs]def _is_download_necessary(path, response): """Check whether a download is necessary. There three criteria. 1. If the file is missing, download it. 2. The following two checks depend on each other. 1. Some files have an entry in the header which specifies when the file was modified last. If the file has been modified, download it. 2. If the header has no entry for the last modified date, we compare file sizes. If the file sizes do not match, the file is downloaded. """ path_yaml = path.with_suffix(".yaml") if path_yaml.exists(): last_modified_offline = pd.to_datetime( yaml.safe_load(path_yaml.read_text())["last_modified"] ) else: last_modified_offline = None last_modified_online = pd.to_datetime(response.headers.get("last-modified", None)) path.with_suffix(".yaml").write_text( yaml.dump({"last_modified": response.headers.get("last-modified", None)}) ) if not path.exists(): is_necessary = True reason = f"The file {path.name} does not exist." elif ( last_modified_online is not None and last_modified_online > last_modified_offline ): is_necessary = True reason = f"{path.name} has been modified online." elif last_modified_online is None: file_size_offline = path.stat().st_size file_size_online = int(response.headers.get("content-length", 0)) if file_size_online != file_size_offline: is_necessary = True reason = f"File sizes differ for {path.name}" else: is_necessary = False reason = f"File {path.name} is already downloaded." else: is_necessary = False reason = f"File {path.name} is already downloaded." return is_necessary, reason
[docs]def _downloader(file: Path, url: str, response: int): """Download url in ``URLS[position]`` to disk with possible resumption. Parameters ---------- file : str Path of file on disk url : str URL of file """ # Establish connection r = requests.get(url, stream=True) # Set configuration block_size = 1024 mode = "wb" with open(file, mode) as f: with tqdm( total=int(response.headers.get("content-length", 0)), unit="B", unit_scale=True, unit_divisor=1024, desc=file.name, initial=0, ascii=True, miniters=1, ) as pbar: for chunk in r.iter_content(32 * block_size): f.write(chunk) pbar.update(len(chunk))
[docs]def download_file(url: str, path: str): """Execute the correct download operation. If offline and online filesize differ, download the file again. """ # Establish connection to header of file response = requests.head(url, headers={"Accept-Encoding": None}) is_necessary, reason = _is_download_necessary(path, response) if is_necessary: _downloader(path, url, response)
@pytask.mark.parametrize("url, produces", PARAMETRIZED_DOWNLOADS)
[docs]def task_download_file(url, produces): download_file(url, produces)