From 1dcc1c7a61a13b8e4708549ffb128282b7276f1e Mon Sep 17 00:00:00 2001 From: claude-marie Date: Fri, 3 Apr 2026 12:51:03 +0200 Subject: [PATCH] seasonality_case --- .../code/snt_seasonality_cases.ipynb | 2094 ++++++++--------- .../utils/snt_seasonality_cases.r | 170 ++ 2 files changed, 1187 insertions(+), 1077 deletions(-) create mode 100644 pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r diff --git a/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb b/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb index 669677e..87663d7 100644 --- a/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb +++ b/pipelines/snt_seasonality_cases/code/snt_seasonality_cases.ipynb @@ -1,1106 +1,1046 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "5eebc540-e973-497e-8427-e73d546fdd09", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + "cells": [ + { + "cell_type": "markdown", + "id": "5eebc540-e973-497e-8427-e73d546fdd09", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Case Seasonality Pipeline\n", + "\n", + "This pipeline classifies administrative units (`ADM2`) as **seasonal** or **non-seasonal** based on confirmed malaria case patterns, determines the **duration** of their transmission season, and identifies the respective **onset month**.\n", + "\n", + "### Methodology\n", + "\n", + "The pipeline employs a four-step hierarchical method:\n", + "\n", + "#### 1. Identify \"start\" month\n", + "Evaluates every specific month in the dataset (per district and year) to determine if it marks the beginning of a concentrated transmission period.\n", + "- **Logic**: For each month, compute a forward-looking proportion: the ratio of cases in the next n-month block to the annual total.\n", + "- **Denominator**: Can use either a 12-month forward-looking sliding window (WHO approach) or calendar year (Jan-Dec).\n", + "- **Threshold**: If this proportion exceeds `threshold_for_seasonality` (e.g., 60%), the month is flagged as a valid \"start\" month.\n", + "\n", + "#### 2. Classify ADM2 as \"Seasonal\" or \"Non-seasonal\"\n", + "Aggregates month-level flags to determine if the district consistently experiences a transmission season.\n", + "- **Logic**: Calculate the proportion of years that a specific month was flagged as a \"start\" in Step 1.\n", + "- **Consistency**: If this proportion exceeds `threshold_proportion_seasonal_years` (e.g., 70%), the district is classified as \"Seasonal\".\n", + "\n", + "#### 3. Determine season duration\n", + "Resolves cases where a district qualifies for multiple block durations by selecting the minimum duration.\n", + "- **Output**: `SEASONAL_BLOCK_DURATION_CASES` - the shortest window with the required case proportion.\n", + "\n", + "#### 4. Determine season onset\n", + "Identifies the single official start month using the mode (most frequent value) across years.\n", + "- **Output**: `SEASONAL_BLOCK_START_MONTH_CASES`\n", + "\n", + "---\n", + "\n", + "## Preliminaries" + ] }, - "tags": [] - }, - "source": [ - "## Case Seasonality Pipeline\n", - "\n", - "This pipeline classifies administrative units (`ADM2`) as **seasonal** or **non-seasonal** based on confirmed malaria case patterns, determines the **duration** of their transmission season, and identifies the respective **onset month**.\n", - "\n", - "### Methodology\n", - "\n", - "The pipeline employs a four-step hierarchical method:\n", - "\n", - "#### 1. Identify \"start\" month\n", - "Evaluates every specific month in the dataset (per district and year) to determine if it marks the beginning of a concentrated transmission period.\n", - "- **Logic**: For each month, compute a forward-looking proportion: the ratio of cases in the next n-month block to the annual total.\n", - "- **Denominator**: Can use either a 12-month forward-looking sliding window (WHO approach) or calendar year (Jan-Dec).\n", - "- **Threshold**: If this proportion exceeds `threshold_for_seasonality` (e.g., 60%), the month is flagged as a valid \"start\" month.\n", - "\n", - "#### 2. Classify ADM2 as \"Seasonal\" or \"Non-seasonal\"\n", - "Aggregates month-level flags to determine if the district consistently experiences a transmission season.\n", - "- **Logic**: Calculate the proportion of years that a specific month was flagged as a \"start\" in Step 1.\n", - "- **Consistency**: If this proportion exceeds `threshold_proportion_seasonal_years` (e.g., 70%), the district is classified as \"Seasonal\".\n", - "\n", - "#### 3. Determine season duration\n", - "Resolves cases where a district qualifies for multiple block durations by selecting the minimum duration.\n", - "- **Output**: `SEASONAL_BLOCK_DURATION_CASES` - the shortest window with the required case proportion.\n", - "\n", - "#### 4. Determine season onset\n", - "Identifies the single official start month using the mode (most frequent value) across years.\n", - "- **Output**: `SEASONAL_BLOCK_START_MONTH_CASES`\n", - "\n", - "---\n", - "\n", - "## Preliminaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "273b05d8-d287-4acc-bd43-5ba6642bd9fa", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# install.packages(\"fpp3\", repos = \"https://cloud.r-project.org\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7789c67", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Clear environment\n", - "rm(list=ls())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18d197b6-9de8-4e4b-bc4e-8b452de67287", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "273b05d8-d287-4acc-bd43-5ba6642bd9fa", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# install.packages(\"fpp3\", repos = \"https://cloud.r-project.org\")" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Global settings\n", - "options(scipen=999)\n", - "\n", - "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", - "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6915379-108e-4405-b553-b074aad447d6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "f7789c67", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Clear environment\n", + "rm(list=ls())" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Paths\n", - "ROOT_PATH <- '~/workspace'\n", - "CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')\n", - "CODE_PATH <- file.path(ROOT_PATH, 'code')\n", - "DATA_PATH <- file.path(ROOT_PATH, 'data')\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'seasonality_cases')\n", - "INTERMEDIATE_RESULTS_PATH <- file.path(OUTPUT_DATA_PATH, \"intermediate_results\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f94d1e6c-0675-4349-b6e0-a28197c8c9e4", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load utils\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "180b93e7-61af-4981-863f-593b755968bd", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# List required pcks\n", - "required_packages <- c(\n", - " \"jsonlite\",\n", - " \"data.table\",\n", - " \"ggplot2\",\n", - " \"fpp3\",\n", - " \"arrow\",\n", - " \"glue\",\n", - " \"sf\",\n", - " \"RColorBrewer\",\n", - " \"httr\",\n", - " \"reticulate\"\n", - ")\n", - "\n", - "# Execute function\n", - "install_and_load(required_packages)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "609cd062-d7d9-42de-976b-10f8a0bfc18a", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "18d197b6-9de8-4e4b-bc4e-8b452de67287", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Global settings\n", + "options(scipen=999)\n", + "\n", + "Sys.setenv(PROJ_LIB = \"/opt/conda/share/proj\")\n", + "Sys.setenv(GDAL_DATA = \"/opt/conda/share/gdal\")" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "reticulate::py_config()$python\n", - "openhexa <- import(\"openhexa.sdk\")\n", - "\n", - "# Check that compute_month_seasonality() supports the required parameter\n", - "if (!(\"use_calendar_year_denominator\" %in% names(formals(compute_month_seasonality)))) {\n", - " error_msg <- paste0(\n", - " \"Error: The function compute_month_seasonality() does not support the parameter 'use_calendar_year_denominator'. \",\n", - " \"Please ensure that the snt_utils.r file is updated to the latest version.\"\n", - " )\n", - " log_msg(error_msg, level = \"error\")\n", - " stop(error_msg)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "458b3d65-cc7e-41bc-95fd-7011dcd5528f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "a6915379-108e-4405-b553-b074aad447d6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Paths\n", + "ROOT_PATH <- '~/workspace'\n", + "PIPELINE_PATH <- file.path(ROOT_PATH, 'pipelines', 'snt_seasonality_cases')" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load SNT config\n", - "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", - "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", - " error = function(e) {\n", - " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", - " cat(msg) \n", - " stop(msg) \n", - " })\n", - "\n", - "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", - "log_msg(msg)\n", - "\n", - "# Set config variables\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "print(paste(\"Country code: \", COUNTRY_CODE))" - ] - }, - { - "cell_type": "markdown", - "id": "804a1bd1-26c8-4f6a-af35-3eba64fe0741", - "metadata": {}, - "source": [ - "## Globals and parameters" - ] - }, - { - "cell_type": "markdown", - "id": "414f9ee0-5264-43c4-992f-cff6c719d65c", - "metadata": {}, - "source": [ - "**Parameters**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb82560d-c123-4c54-bfa1-fb5f05e4ad69", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "minimum_periods <- as.integer(36)\n", - "maximum_proportion_missings_overall <- 0.1\n", - "maximum_proportion_missings_per_district <- 0.2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9024226d-5845-48a0-8ae4-e7b5a8d11988", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "f94d1e6c-0675-4349-b6e0-a28197c8c9e4", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load utils and bootstrap context\n", + "source(file.path(PIPELINE_PATH, \"utils\", \"snt_seasonality_cases.r\"))\n", + "\n", + "setup_ctx <- bootstrap_seasonality_cases_context(\n", + " root_path = ROOT_PATH,\n", + " required_packages = c(\n", + " \"jsonlite\", \"data.table\", \"ggplot2\", \"fpp3\", \"arrow\", \"glue\",\n", + " \"sf\", \"RColorBrewer\", \"httr\", \"reticulate\"\n", + " )\n", + ")\n", + "\n", + "CONFIG_PATH <- setup_ctx$CONFIG_PATH\n", + "OUTPUT_DATA_PATH <- setup_ctx$OUTPUT_DATA_PATH\n", + "INTERMEDIATE_RESULTS_PATH <- setup_ctx$INTERMEDIATE_RESULTS_PATH" + ] }, - "tags": [ - "parameters" - ], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Fallback parameter values for local/dev execution\n", - "# When run via pipeline, these are injected by Papermill in the first cell\n", - "if (!exists(\"minimum_month_block_size\")) {\n", - " minimum_month_block_size <- as.integer(3)\n", - "}\n", - "if (!exists(\"maximum_month_block_size\")) {\n", - " maximum_month_block_size <- as.integer(5)\n", - "}\n", - "if (!exists(\"threshold_for_seasonality\")) {\n", - " threshold_for_seasonality <- 0.6\n", - "}\n", - "if (!exists(\"threshold_proportion_seasonal_years\")) {\n", - " threshold_proportion_seasonal_years <- 0.5\n", - "}\n", - "if (!exists(\"use_calendar_year_denominator\")) {\n", - " use_calendar_year_denominator <- FALSE\n", - "}\n", - "\n", - "# Ensure correct types\n", - "minimum_month_block_size <- as.integer(minimum_month_block_size)\n", - "maximum_month_block_size <- as.integer(maximum_month_block_size)\n", - "\n", - "# Log parameter values\n", - "log_msg(paste(\"Minimum month block size:\", minimum_month_block_size))\n", - "log_msg(paste(\"Maximum month block size:\", maximum_month_block_size))\n", - "log_msg(paste(\"Threshold for seasonality:\", threshold_for_seasonality))\n", - "log_msg(paste(\"Threshold proportion seasonal years:\", threshold_proportion_seasonal_years))\n", - "log_msg(paste(\"Use calendar year denominator:\", use_calendar_year_denominator))" - ] - }, - { - "cell_type": "markdown", - "id": "0b2d3bb6-6351-4f32-92de-44a6579b6630", - "metadata": {}, - "source": [ - "**Fixed routine formatting columns**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90b27881-b25d-4cb3-8b2f-4dd1b395bdee", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "180b93e7-61af-4981-863f-593b755968bd", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Packages are loaded in bootstrap_seasonality_cases_context()." + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Global variables\n", - "type_of_seasonality <- \"cases\"\n", - "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", - "data_source <- \"DHIS2\"\n", - "original_values_col <- \"CONF\"\n", - "\n", - "# space and time columns\n", - "admin_level <- 'ADM2'\n", - "admin_id_col <- paste(admin_level, toupper('id'), sep = '_')\n", - "admin_name_col <- paste(admin_level, toupper('name'), sep = '_')\n", - "year_col <- 'YEAR'\n", - "month_col <- 'MONTH'\n", - "period_cols <- c(year_col, month_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "473308f4-4630-4d9e-82a9-b2b4fc9134db", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "609cd062-d7d9-42de-976b-10f8a0bfc18a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "reticulate::py_config()$python\n", + "\n", + "# Check that compute_month_seasonality() supports the required parameter\n", + "if (!(\"use_calendar_year_denominator\" %in% names(formals(compute_month_seasonality)))) {\n", + " error_msg <- paste0(\n", + " \"Error: The function compute_month_seasonality() does not support the parameter 'use_calendar_year_denominator'. \",\n", + " \"Please ensure that the snt_utils.r file is updated to the latest version.\"\n", + " )\n", + " log_msg(error_msg, level = \"error\")\n", + " stop(error_msg)\n", + "}" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "possible_month_block_sizes <- as.integer(minimum_month_block_size:maximum_month_block_size)\n", - "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", - "print(paste(\"Formatted threshold :\",formatted_threshold_for_seasonality))" - ] - }, - { - "cell_type": "markdown", - "id": "86f492f3-5634-4987-a2b8-23014aba5d51", - "metadata": {}, - "source": [ - "## Load data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "623480ee-4310-4ead-a8c8-bf294527c814", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load spatial file from dataset\n", - "spatial_data_filename <- paste(COUNTRY_CODE, \"shapes.geojson\", sep = \"_\")\n", - "spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)\n", - "log_msg(glue(\"File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f766ea1-dced-4143-a5be-fdc51da4bd8d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "458b3d65-cc7e-41bc-95fd-7011dcd5528f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load SNT config\n", + "CONFIG_FILE_NAME <- \"SNT_config.json\"\n", + "config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },\n", + " error = function(e) {\n", + " msg <- paste0(\"Error while loading configuration\", conditionMessage(e)) \n", + " cat(msg) \n", + " stop(msg) \n", + " })\n", + "\n", + "msg <- paste0(\"SNT configuration loaded from : \", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) \n", + "log_msg(msg)\n", + "\n", + "# Set config variables\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "print(paste(\"Country code: \", COUNTRY_CODE))" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load routine data from dataset\n", - "case_data_filename <- paste(COUNTRY_CODE, \"routine.parquet\", sep = \"_\")\n", - "original_dt <- get_latest_dataset_file_in_memory(dhis2_dataset, case_data_filename)\n", - "log_msg(glue(\"File {case_data_filename} successfully loaded from dataset version: {dhis2_dataset}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b769deb-52e5-471d-9950-ac431dd8cf03", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Columns formatting\n", - "admin_data <- st_drop_geometry(spatial_data)\n", - "setDT(admin_data)\n", - "common_cols <- names(admin_data)\n", - "\n", - "seasonality_col <- glue('SEASONALITY', toupper(type_of_seasonality), .sep = \"_\")\n", - "season_duration_col <- glue('SEASONAL_BLOCK_DURATION', toupper(type_of_seasonality), .sep = \"_\")\n", - "season_start_month_col <- glue('SEASONAL_BLOCK_START_MONTH', toupper(type_of_seasonality), .sep = \"_\")\n", - "cases_proportion_col <- 'CASES_PROPORTION'\n", - "final_table_cols <- c(names(admin_data), seasonality_col, season_duration_col, season_start_month_col, cases_proportion_col)\n", - "print(final_table_cols)" - ] - }, - { - "cell_type": "markdown", - "id": "0d329af2-f544-4ee2-940f-65e2ab11c49d", - "metadata": {}, - "source": [ - "**Create the containers for the data**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90486c1e-38bc-4c6f-bffe-b7e8f3be68ca", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create an empty table if the analysis is stopped for lack of enough data\n", - "seasonality_cols <- c(seasonality_col, season_duration_col, season_start_month_col, cases_proportion_col)\n", - "empty_dt <- copy(admin_data)[, (seasonality_cols) := NA]" - ] - }, - { - "cell_type": "markdown", - "id": "b8da71be-45f1-405c-857c-ed86984988f4", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "804a1bd1-26c8-4f6a-af35-3eba64fe0741", + "metadata": {}, + "source": [ + "## Globals and parameters" + ] }, - "tags": [] - }, - "source": [ - "## Preprocess input data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5bf0faa-357e-44a7-af0c-04dd382af7e0", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "414f9ee0-5264-43c4-992f-cff6c719d65c", + "metadata": {}, + "source": [ + "**Parameters**" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# format table\n", - "setDT(original_dt)\n", - "integer_cols <- c(year_col, month_col)\n", - "numeric_cols <- c(original_values_col)\n", - "original_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]\n", - "# head(original_dt)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1a762ad-943e-467b-8cc1-e4998a996b9f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "fb82560d-c123-4c54-bfa1-fb5f05e4ad69", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "minimum_periods <- as.integer(36)\n", + "maximum_proportion_missings_overall <- 0.1\n", + "maximum_proportion_missings_per_district <- 0.2" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# keep only the useful columns and aggregate the data on them\n", - "original_dt <- original_dt[,\n", - " setNames(list(sum(get(original_values_col), na.rm = TRUE)), original_values_col), \n", - " by = c(admin_id_col, period_cols)\n", - " ]\n", - "\n", - "num_periods <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[1]]\n", - "all_rows <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[2]]\n", - "\n", - "if (num_periods < minimum_periods){ \n", - " log_msg(glue(\"Data is not reliable: \n", - " at least {minimum_periods} year-month periods of data are required for the case analyais; \n", - " the data only contains {num_periods} periods. Abandoning analysis.\")\n", - " , level=\"error\")\n", - " stop(\"ERROR 1\")\n", - "}\n", - "\n", - "# inject the (possibly missing) rows into the data\n", - "original_dt <- make_full_time_space_data(\n", - " input_dt=original_dt,\n", - " full_rows_dt=all_rows,\n", - " target_colname=original_values_col,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col)\n", - "\n", - "if(nrow(original_dt[is.na(get(original_values_col)),]) > (maximum_proportion_missings_overall * nrow(original_dt))){ \n", - " log_msg(\"There are too many missing values in the data overall. Abandoning analysis.\", level=\"error\")\n", - " stop(\"ERROR 2\") \n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "e3d793a5-ac96-4dcc-bd86-5837a631ea54", - "metadata": {}, - "source": [ - "### Imputation of missings" - ] - }, - { - "cell_type": "markdown", - "id": "1b7c767b-5343-4d7a-ad6a-aac11ee2ba12", - "metadata": {}, - "source": [ - "**Remove impute files (if any)**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac3414c6-baf1-47f0-ad6d-5ff1cb0e432e", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Remove existing imputation files\n", - "filename_imputed_dt <- paste(COUNTRY_CODE, type_of_seasonality, 'imputed.csv', sep = '_')\n", - "files_in_folder <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", - "files_to_remove <- files_in_folder[grepl(filename_imputed_dt, basename(files_in_folder), ignore.case = TRUE)]\n", - "file.remove(files_to_remove)\n", - "print(glue(\"Deleted files: {str(files_to_remove)}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cf34a4f-f429-4ee5-9919-5e5c7abe9da6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "9024226d-5845-48a0-8ae4-e7b5a8d11988", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "parameters" + ], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fallback parameter values for local/dev execution\n", + "# When run via pipeline, these are injected by Papermill in the first cell\n", + "if (!exists(\"minimum_month_block_size\")) {\n", + " minimum_month_block_size <- as.integer(3)\n", + "}\n", + "if (!exists(\"maximum_month_block_size\")) {\n", + " maximum_month_block_size <- as.integer(5)\n", + "}\n", + "if (!exists(\"threshold_for_seasonality\")) {\n", + " threshold_for_seasonality <- 0.6\n", + "}\n", + "if (!exists(\"threshold_proportion_seasonal_years\")) {\n", + " threshold_proportion_seasonal_years <- 0.5\n", + "}\n", + "if (!exists(\"use_calendar_year_denominator\")) {\n", + " use_calendar_year_denominator <- FALSE\n", + "}\n", + "\n", + "# Ensure correct types\n", + "minimum_month_block_size <- as.integer(minimum_month_block_size)\n", + "maximum_month_block_size <- as.integer(maximum_month_block_size)\n", + "\n", + "# Log parameter values\n", + "log_msg(paste(\"Minimum month block size:\", minimum_month_block_size))\n", + "log_msg(paste(\"Maximum month block size:\", maximum_month_block_size))\n", + "log_msg(paste(\"Threshold for seasonality:\", threshold_for_seasonality))\n", + "log_msg(paste(\"Threshold proportion seasonal years:\", threshold_proportion_seasonal_years))\n", + "log_msg(paste(\"Use calendar year denominator:\", use_calendar_year_denominator))" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# create the name of the column which will store the imputed/estimated values\n", - "imputed_col = paste(original_values_col, 'EST', sep = '_')\n", - "\n", - "# if there are rows of missing data for cases, impute them (SARIMA)\n", - "if(nrow(original_dt[!is.na(get(original_values_col)),]) != nrow(original_dt)) {\n", - " log_msg(\"There is missing data. Proceeding to impute them.\", level=\"warning\")\n", - " \n", - " # extract data on only the administrative units which have missing values for original_values_col\n", - " missing_dt <- extract_dt_with_missings(original_dt, target_colname = original_values_col, id_colname = admin_id_col)\n", - " missing_dt <- missing_dt[, PERIOD := make_yearmonth(year = YEAR, month = MONTH)]\n", - " missing_dt <- missing_dt[, .SD, .SDcols = c(admin_id_col, 'PERIOD', original_values_col)]\n", - " \n", - " # how many rows missing for each administrative unit? if too many, then not good idea to impute\n", - " missings_by_admin_unit <- missing_dt[, .(missing_count = sum(is.na(get(original_values_col)))), by = admin_id_col][order(-missing_count)]\n", - " \n", - " # if for any given admin unit, more than a given % of data is missing, there's too much to impute (maybe should be stricter - to discuss)\n", - " if(missings_by_admin_unit[, max(missing_count)] > maximum_proportion_missings_per_district * num_periods){\n", - " log_msg(\"Some administrative units have too many missing values in the target data. Abandoning analysis.\", level=\"error\")\n", - " stop(\"ERROR 3\")\n", - " }\n", - " \n", - " # split to list per admin_unit_id, to apply SARIMA imputation on each time series (per admin unit)\n", - " missing_districts_list <- split(missing_dt, by = admin_id_col)\n", - " \n", - " # seasonal ARIMA to estimate missing cases: apply function to list of data.tables with missing rows, then create data.table from result\n", - " filled_missings_dt <- rbindlist(\n", - " lapply(missing_districts_list,\n", - " fill_missing_cases_ts,\n", - " original_values_colname=original_values_col,\n", - " estimated_values_colname=imputed_col,\n", - " admin_colname=admin_id_col,\n", - " period_colname='PERIOD',\n", - " threshold_for_missing = 0.0)\n", - " )\n", - " \n", - " # add the imputed (\"_EST\") values to the original data\n", - " imputed_dt <- merge.data.table(original_dt, filled_missings_dt[, .SD, .SDcols = !(original_values_col)], by = c(admin_id_col, year_col, month_col), all.x = TRUE)\n", - " \n", - " # copy from the districts without missings;\n", - " # if data is large, this could be made faster by only copying from the districts which are not in the missing_dt\n", - " imputed_dt[!is.na(get(original_values_col)), (imputed_col) := get(original_values_col)]\n", - "\n", - " # Save imputed file, only if it was computed (if there is missing data to impute)\n", - " safe_create_dir(INTERMEDIATE_RESULTS_PATH)\n", - " fwrite(imputed_dt, file = file.path(INTERMEDIATE_RESULTS_PATH, filename_imputed_dt))\n", - " \n", - "} else {\n", - " imputed_dt <- copy(original_dt)\n", - " imputed_dt[, (imputed_col) := get(original_values_col)]\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "9db44942-d844-491c-9045-906e99a37c60", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "0b2d3bb6-6351-4f32-92de-44a6579b6630", + "metadata": {}, + "source": [ + "**Fixed routine formatting columns**" + ] }, - "tags": [] - }, - "source": [ - "## Seasonality" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bfa3ebf-3a04-405a-9cc6-5f174a08f70b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "90b27881-b25d-4cb3-8b2f-4dd1b395bdee", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Global variables\n", + "type_of_seasonality <- \"cases\"\n", + "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", + "data_source <- \"DHIS2\"\n", + "original_values_col <- \"CONF\"\n", + "\n", + "# space and time columns\n", + "admin_level <- 'ADM2'\n", + "admin_id_col <- paste(admin_level, toupper('id'), sep = '_')\n", + "admin_name_col <- paste(admin_level, toupper('name'), sep = '_')\n", + "year_col <- 'YEAR'\n", + "month_col <- 'MONTH'\n", + "period_cols <- c(year_col, month_col)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Step 1: Compute month-level seasonality indicators\n", - "# For each row (period-admin unit), determine if it marks the start of a seasonal block\n", - "\n", - "row_seasonality_dt <- compute_month_seasonality(\n", - " input_dt=imputed_dt,\n", - " indicator=type_of_seasonality,\n", - " values_colname=imputed_col,\n", - " vector_of_durations=possible_month_block_sizes,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col,\n", - " proportion_threshold=threshold_for_seasonality,\n", - " use_calendar_year_denominator=use_calendar_year_denominator\n", - ")\n", - "\n", - "# Create the filename\n", - "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'row_seasonality', sep = '_')\n", - "filename_csv = glue(\"{file_stem}.csv\")\n", - "filename_parquet = glue(\"{file_stem}.parquet\")\n", - "fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", - "write_parquet(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", - "\n", - "\n", - "# The seasonality per admin unit, irrespective of year ----------------------\n", - "\n", - "seasonality_source_dt <- process_seasonality(\n", - " input_dt=row_seasonality_dt,\n", - " indicator=type_of_seasonality,\n", - " vector_of_durations=possible_month_block_sizes,\n", - " admin_colname=admin_id_col,\n", - " year_colname=year_col,\n", - " month_colname=month_col,\n", - " proportion_seasonal_years_threshold=threshold_proportion_seasonal_years\n", - ")\n", - "\n", - "# Compute the duration block; there are normal warnings when it's only 0-es for seasonality:\n", - "# for those admin units without any seasonality, the duration of the block will be 'infinite')\n", - "check_pattern_seasonality <- paste(\"^SEASONALITY\", toupper(type_of_seasonality), \"[0-9]+_MTH$\", sep = \"_\")\n", - "seasonality_source_dt <- seasonality_source_dt[, .SD, .SDcols = c(admin_id_col, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]" - ] - }, - { - "cell_type": "markdown", - "id": "d9f8270f-7283-4630-b9ba-62366b1c3e62", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "473308f4-4630-4d9e-82a9-b2b4fc9134db", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "possible_month_block_sizes <- as.integer(minimum_month_block_size:maximum_month_block_size)\n", + "formatted_threshold_for_seasonality <- sprintf(\"%d%%\", round(threshold_for_seasonality * 100))\n", + "print(paste(\"Formatted threshold :\",formatted_threshold_for_seasonality))" + ] }, - "tags": [] - }, - "source": [ - "## Result file" - ] - }, - { - "cell_type": "markdown", - "id": "477fb459-0f98-4a32-96ab-f10b4395495f", - "metadata": {}, - "source": [ - "### long" - ] - }, - { - "cell_type": "markdown", - "id": "db719aed-6347-48f4-8984-add9f8adec2d", - "metadata": {}, - "source": [ - "This format, until further notice, is not saved." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50ba68f5-e9a9-4144-99dc-de8cc44770e9", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "86f492f3-5634-4987-a2b8-23014aba5d51", + "metadata": {}, + "source": [ + "## Load data" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "seasonality_long_dt <- melt(\n", - " seasonality_source_dt,\n", - " id.vars = grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE, invert = TRUE), # all cols which don't follow the pattern\n", - " variable.name = 'MONTH_BLOCK_SIZE',\n", - " value.name =seasonality_col\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b2a400d4-a545-40ab-9204-b6b2e69ea499", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "623480ee-4310-4ead-a8c8-bf294527c814", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load spatial/routine inputs from dataset\n", + "seasonality_inputs <- load_seasonality_input_data(\n", + " dataset_name = dhis2_dataset,\n", + " country_code = COUNTRY_CODE\n", + ")\n", + "spatial_data <- seasonality_inputs$spatial_data" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "seasonality_long_dt[, MONTH_BLOCK_SIZE := possible_month_block_sizes[match(MONTH_BLOCK_SIZE, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]]\n", - "\n", - "# add remaining admin unit columns and save the final results\n", - "admin_seasonality_long_dt <- merge.data.table(admin_data, seasonality_long_dt, by = c(admin_id_col), all = TRUE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a7926d5-d4e7-4707-85b8-7020c3738be3", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# order the columns\n", - "specific_cols <- setdiff(names(admin_seasonality_long_dt), names(admin_data)) # last columns\n", - "admin_seasonality_long_dt <- admin_seasonality_long_dt[, .SD, .SDcols = c(common_cols, specific_cols)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "343a07c2-2e0e-49eb-b487-cc8d6431e015", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "1f766ea1-dced-4143-a5be-fdc51da4bd8d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "original_dt <- seasonality_inputs$original_dt" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Keeping for now.\n", - "# # filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", - "# filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", - "# fwrite(admin_seasonality_long_dt, file.path(OUTPUT_DATA_PATH, filename_admin_seasonality_long_dt))" - ] - }, - { - "cell_type": "markdown", - "id": "36d1f9cb-75b6-4f6a-a18c-b34eb233b8d2", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "7b769deb-52e5-471d-9950-ac431dd8cf03", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Columns formatting\n", + "admin_data <- st_drop_geometry(spatial_data)\n", + "setDT(admin_data)\n", + "common_cols <- names(admin_data)\n", + "\n", + "seasonality_col <- glue('SEASONALITY', toupper(type_of_seasonality), .sep = \"_\")\n", + "season_duration_col <- glue('SEASONAL_BLOCK_DURATION', toupper(type_of_seasonality), .sep = \"_\")\n", + "season_start_month_col <- glue('SEASONAL_BLOCK_START_MONTH', toupper(type_of_seasonality), .sep = \"_\")\n", + "cases_proportion_col <- 'CASES_PROPORTION'\n", + "final_table_cols <- c(names(admin_data), seasonality_col, season_duration_col, season_start_month_col, cases_proportion_col)\n", + "print(final_table_cols)" + ] }, - "tags": [] - }, - "source": [ - "### Transform to wide format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e71259c9-29a6-452f-8949-74adb0e62c1c", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "0d329af2-f544-4ee2-940f-65e2ab11c49d", + "metadata": {}, + "source": [ + "**Create the containers for the data**" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "seasonality_wide_dt <- compute_min_seasonality_block(\n", - " input_dt=seasonality_source_dt,\n", - " seasonality_column_pattern=check_pattern_seasonality,\n", - " vector_of_possible_month_block_sizes=possible_month_block_sizes,\n", - " # indicator=toupper(type_of_seasonality),\n", - " seasonal_blocksize_colname=season_duration_col,\n", - " valid_value = 1\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9284c723-97d4-46f9-8837-7f70dae92a31", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "90486c1e-38bc-4c6f-bffe-b7e8f3be68ca", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create an empty table if the analysis is stopped for lack of enough data\n", + "seasonality_cols <- c(seasonality_col, season_duration_col, season_start_month_col, cases_proportion_col)\n", + "empty_dt <- copy(admin_data)[, (seasonality_cols) := NA]" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create a new, overall column 'SEASONALITY_' based on the values of columns in 'check_pattern_seasonality'\n", - "seasonality_pattern_cols <- grep(check_pattern_seasonality, names(seasonality_wide_dt), value = TRUE)\n", - "if (length(seasonality_pattern_cols) > 0L) {\n", - " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_col) := ifelse(rowSums(.SD == 1, na.rm = TRUE) > 0, 1L, 0L), .SDcols = seasonality_pattern_cols]\n", - " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_pattern_cols) := NULL]\n", - "} else {\n", - " seasonality_wide_dt[, (seasonality_col) := NA_integer_]\n", - "}\n", - "\n", - "# Compute CASES_PROPORTION: proportion of cases in the seasonal block vs ANNUAL total\n", - "# Only for seasonal admin units (SEASONALITY_CASES = 1)\n", - "\n", - "# Step 1: Compute annual totals per admin-year from imputed data\n", - "annual_totals_dt <- imputed_dt[, .(ANNUAL_TOTAL = sum(get(imputed_col), na.rm = TRUE)), by = c(admin_id_col, year_col)]\n", - "\n", - "# Step 2: Function to compute proportion = max block sum / annual total\n", - "compute_cases_proportion <- function(admin_id, block_duration, row_data, annual_data, admin_col, year_column) {\n", - " if (is.na(block_duration) || is.infinite(block_duration)) return(NA_real_)\n", - " \n", - " # Column with block sum (N-month forward-looking sum)\n", - " sum_col <- paste('CASES_SUM', block_duration, 'MTH_FW', sep = '_')\n", - " if (!sum_col %in% names(row_data)) return(NA_real_)\n", - " \n", - " admin_row_data <- row_data[get(admin_col) == admin_id]\n", - " admin_annual_data <- annual_data[get(admin_col) == admin_id]\n", - " if (nrow(admin_row_data) == 0 || nrow(admin_annual_data) == 0) return(NA_real_)\n", - " \n", - " # For each year, get max block sum (only if there are non-NA values)\n", - " yearly_max_block <- admin_row_data[\n", - " !is.na(get(sum_col)),\n", - " .(max_block_sum = if (.N > 0L) max(get(sum_col), na.rm = TRUE) else NA_real_),\n", - " by = year_column\n", - " ]\n", - " \n", - " # Remove rows with NA or -Inf (from max when all values were NA)\n", - " yearly_max_block <- yearly_max_block[is.finite(max_block_sum)]\n", - " if (nrow(yearly_max_block) == 0) return(NA_real_)\n", - " \n", - " # Merge with annual totals\n", - " merged <- merge(yearly_max_block, admin_annual_data, by = year_column)\n", - " merged <- merged[ANNUAL_TOTAL > 0]\n", - " if (nrow(merged) == 0) return(NA_real_)\n", - " \n", - " # Proportion = block sum / annual total, then average across years\n", - " merged[, prop := max_block_sum / ANNUAL_TOTAL]\n", - " return(mean(merged$prop, na.rm = TRUE))\n", - "}\n", - "\n", - "seasonality_wide_dt[, (cases_proportion_col) := mapply(\n", - " compute_cases_proportion,\n", - " admin_id = get(admin_id_col),\n", - " block_duration = get(season_duration_col),\n", - " MoreArgs = list(row_data = row_seasonality_dt, annual_data = annual_totals_dt, admin_col = admin_id_col, year_column = year_col)\n", - ")]\n", - "\n", - "# Set CASES_PROPORTION to NA for non-seasonal admin units\n", - "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (cases_proportion_col) := NA_real_]\n", - "\n", - "# Compute SEASONAL_BLOCK_START_MONTH: first month of the seasonal block\n", - "# Only for seasonal admin units (SEASONALITY_CASES = 1)\n", - "\n", - "# Function to find the most frequent starting month for a given admin unit and block duration\n", - "compute_start_month <- function(admin_id, block_duration, row_data, admin_col, year_column, month_column) {\n", - " if (is.na(block_duration) || is.infinite(block_duration)) return(NA_integer_)\n", - " \n", - " # Column with row-level seasonality indicator for this block duration\n", - " seasonality_row_col <- paste('CASES', block_duration, 'MTH_ROW_SEASONALITY', sep = '_')\n", - " if (!seasonality_row_col %in% names(row_data)) return(NA_integer_)\n", - " \n", - " admin_row_data <- row_data[get(admin_col) == admin_id]\n", - " if (nrow(admin_row_data) == 0) return(NA_integer_)\n", - " \n", - " # Filter rows where seasonality = 1 (this month is the start of a seasonal block)\n", - " seasonal_months <- admin_row_data[get(seasonality_row_col) == 1, get(month_column)]\n", - " \n", - " if (length(seasonal_months) == 0) return(NA_integer_)\n", - " \n", - " # Find the most frequent month (mode)\n", - " month_counts <- table(seasonal_months)\n", - " most_frequent_month <- as.integer(names(month_counts)[which.max(month_counts)])\n", - " \n", - " return(most_frequent_month)\n", - "}\n", - "\n", - "seasonality_wide_dt[, (season_start_month_col) := mapply(\n", - " compute_start_month,\n", - " admin_id = get(admin_id_col),\n", - " block_duration = get(season_duration_col),\n", - " MoreArgs = list(row_data = row_seasonality_dt, admin_col = admin_id_col, year_column = year_col, month_column = month_col)\n", - ")]\n", - "\n", - "# Set SEASONAL_BLOCK_START_MONTH to NA for non-seasonal admin units\n", - "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (season_start_month_col) := NA_integer_]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5ea18ef-1148-432d-a954-b9c6b4a06afc", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "markdown", + "id": "b8da71be-45f1-405c-857c-ed86984988f4", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Preprocess input data" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# add remaining admin unit columns and save the final results\n", - "admin_seasonality_wide_dt <- merge.data.table(admin_data, seasonality_wide_dt, by = c(admin_id_col), all = TRUE)\n", - "admin_seasonality_wide_dt <- admin_seasonality_wide_dt[, .SD, .SDcols = c(common_cols, seasonality_cols)]\n", - "# head(admin_seasonality_wide_dt)" - ] - }, - { - "cell_type": "markdown", - "id": "a2f1a373-4b34-42db-b591-b25c7050dee6", - "metadata": {}, - "source": [ - "**Save output**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ded1e86d-f4c5-4fdb-a6ba-611d5c7f4aed", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" + { + "cell_type": "code", + "execution_count": null, + "id": "c5bf0faa-357e-44a7-af0c-04dd382af7e0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# format table\n", + "setDT(original_dt)\n", + "integer_cols <- c(year_col, month_col)\n", + "numeric_cols <- c(original_values_col)\n", + "original_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]\n", + "# head(original_dt)" + ] }, - "tags": [], - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create the filename\n", - "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'seasonality', sep = '_')\n", - "filename_csv = glue(\"{file_stem}.csv\")\n", - "filename_parquet = glue(\"{file_stem}.parquet\")\n", - "fwrite(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", - "write_parquet(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", - "log_msg(paste0(\"Case seasonality results saved in folder \", OUTPUT_DATA_PATH))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9533db1c-a9db-42be-a4cd-4076d9f212ba", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, \"row_seasonality.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b39e53f-f4fe-49e4-8e4d-9a6679313a54", - "metadata": { - "vscode": { - "languageId": "r" + { + "cell_type": "code", + "execution_count": null, + "id": "a1a762ad-943e-467b-8cc1-e4998a996b9f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# keep only the useful columns and aggregate the data on them\n", + "original_dt <- original_dt[,\n", + " setNames(list(sum(get(original_values_col), na.rm = TRUE)), original_values_col), \n", + " by = c(admin_id_col, period_cols)\n", + " ]\n", + "\n", + "num_periods <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[1]]\n", + "all_rows <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)[[2]]\n", + "\n", + "if (num_periods < minimum_periods){ \n", + " log_msg(glue(\"Data is not reliable: \n", + " at least {minimum_periods} year-month periods of data are required for the case analyais; \n", + " the data only contains {num_periods} periods. Abandoning analysis.\")\n", + " , level=\"error\")\n", + " stop(\"ERROR 1\")\n", + "}\n", + "\n", + "# inject the (possibly missing) rows into the data\n", + "original_dt <- make_full_time_space_data(\n", + " input_dt=original_dt,\n", + " full_rows_dt=all_rows,\n", + " target_colname=original_values_col,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col)\n", + "\n", + "if(nrow(original_dt[is.na(get(original_values_col)),]) > (maximum_proportion_missings_overall * nrow(original_dt))){ \n", + " log_msg(\"There are too many missing values in the data overall. Abandoning analysis.\", level=\"error\")\n", + " stop(\"ERROR 2\") \n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "e3d793a5-ac96-4dcc-bd86-5837a631ea54", + "metadata": {}, + "source": [ + "### Imputation of missings" + ] + }, + { + "cell_type": "markdown", + "id": "1b7c767b-5343-4d7a-ad6a-aac11ee2ba12", + "metadata": {}, + "source": [ + "**Remove impute files (if any)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac3414c6-baf1-47f0-ad6d-5ff1cb0e432e", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Remove existing imputation files\n", + "filename_imputed_dt <- paste(COUNTRY_CODE, type_of_seasonality, 'imputed.csv', sep = '_')\n", + "files_in_folder <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)\n", + "files_to_remove <- files_in_folder[grepl(filename_imputed_dt, basename(files_in_folder), ignore.case = TRUE)]\n", + "file.remove(files_to_remove)\n", + "print(glue(\"Deleted files: {str(files_to_remove)}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cf34a4f-f429-4ee5-9919-5e5c7abe9da6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# create the name of the column which will store the imputed/estimated values\n", + "imputed_col = paste(original_values_col, 'EST', sep = '_')\n", + "\n", + "# if there are rows of missing data for cases, impute them (SARIMA)\n", + "if(nrow(original_dt[!is.na(get(original_values_col)),]) != nrow(original_dt)) {\n", + " log_msg(\"There is missing data. Proceeding to impute them.\", level=\"warning\")\n", + " \n", + " # extract data on only the administrative units which have missing values for original_values_col\n", + " missing_dt <- extract_dt_with_missings(original_dt, target_colname = original_values_col, id_colname = admin_id_col)\n", + " missing_dt <- missing_dt[, PERIOD := make_yearmonth(year = YEAR, month = MONTH)]\n", + " missing_dt <- missing_dt[, .SD, .SDcols = c(admin_id_col, 'PERIOD', original_values_col)]\n", + " \n", + " # how many rows missing for each administrative unit? if too many, then not good idea to impute\n", + " missings_by_admin_unit <- missing_dt[, .(missing_count = sum(is.na(get(original_values_col)))), by = admin_id_col][order(-missing_count)]\n", + " \n", + " # if for any given admin unit, more than a given % of data is missing, there's too much to impute (maybe should be stricter - to discuss)\n", + " if(missings_by_admin_unit[, max(missing_count)] > maximum_proportion_missings_per_district * num_periods){\n", + " log_msg(\"Some administrative units have too many missing values in the target data. Abandoning analysis.\", level=\"error\")\n", + " stop(\"ERROR 3\")\n", + " }\n", + " \n", + " # split to list per admin_unit_id, to apply SARIMA imputation on each time series (per admin unit)\n", + " missing_districts_list <- split(missing_dt, by = admin_id_col)\n", + " \n", + " # seasonal ARIMA to estimate missing cases: apply function to list of data.tables with missing rows, then create data.table from result\n", + " filled_missings_dt <- rbindlist(\n", + " lapply(missing_districts_list,\n", + " fill_missing_cases_ts,\n", + " original_values_colname=original_values_col,\n", + " estimated_values_colname=imputed_col,\n", + " admin_colname=admin_id_col,\n", + " period_colname='PERIOD',\n", + " threshold_for_missing = 0.0)\n", + " )\n", + " \n", + " # add the imputed (\"_EST\") values to the original data\n", + " imputed_dt <- merge.data.table(original_dt, filled_missings_dt[, .SD, .SDcols = !(original_values_col)], by = c(admin_id_col, year_col, month_col), all.x = TRUE)\n", + " \n", + " # copy from the districts without missings;\n", + " # if data is large, this could be made faster by only copying from the districts which are not in the missing_dt\n", + " imputed_dt[!is.na(get(original_values_col)), (imputed_col) := get(original_values_col)]\n", + "\n", + " # Save imputed file, only if it was computed (if there is missing data to impute)\n", + " safe_create_dir(INTERMEDIATE_RESULTS_PATH)\n", + " fwrite(imputed_dt, file = file.path(INTERMEDIATE_RESULTS_PATH, filename_imputed_dt))\n", + " \n", + "} else {\n", + " imputed_dt <- copy(original_dt)\n", + " imputed_dt[, (imputed_col) := get(original_values_col)]\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9db44942-d844-491c-9045-906e99a37c60", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Seasonality" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bfa3ebf-3a04-405a-9cc6-5f174a08f70b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Step 1: Compute month-level seasonality indicators\n", + "# For each row (period-admin unit), determine if it marks the start of a seasonal block\n", + "\n", + "row_seasonality_dt <- compute_month_seasonality(\n", + " input_dt=imputed_dt,\n", + " indicator=type_of_seasonality,\n", + " values_colname=imputed_col,\n", + " vector_of_durations=possible_month_block_sizes,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col,\n", + " proportion_threshold=threshold_for_seasonality,\n", + " use_calendar_year_denominator=use_calendar_year_denominator\n", + ")\n", + "\n", + "# Create the filename\n", + "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'row_seasonality', sep = '_')\n", + "filename_csv = glue(\"{file_stem}.csv\")\n", + "filename_parquet = glue(\"{file_stem}.parquet\")\n", + "fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", + "write_parquet(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", + "\n", + "\n", + "# The seasonality per admin unit, irrespective of year ----------------------\n", + "\n", + "seasonality_source_dt <- process_seasonality(\n", + " input_dt=row_seasonality_dt,\n", + " indicator=type_of_seasonality,\n", + " vector_of_durations=possible_month_block_sizes,\n", + " admin_colname=admin_id_col,\n", + " year_colname=year_col,\n", + " month_colname=month_col,\n", + " proportion_seasonal_years_threshold=threshold_proportion_seasonal_years\n", + ")\n", + "\n", + "# Compute the duration block; there are normal warnings when it's only 0-es for seasonality:\n", + "# for those admin units without any seasonality, the duration of the block will be 'infinite')\n", + "check_pattern_seasonality <- paste(\"^SEASONALITY\", toupper(type_of_seasonality), \"[0-9]+_MTH$\", sep = \"_\")\n", + "seasonality_source_dt <- seasonality_source_dt[, .SD, .SDcols = c(admin_id_col, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]" + ] + }, + { + "cell_type": "markdown", + "id": "d9f8270f-7283-4630-b9ba-62366b1c3e62", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Result file" + ] + }, + { + "cell_type": "markdown", + "id": "477fb459-0f98-4a32-96ab-f10b4395495f", + "metadata": {}, + "source": [ + "### long" + ] + }, + { + "cell_type": "markdown", + "id": "db719aed-6347-48f4-8984-add9f8adec2d", + "metadata": {}, + "source": [ + "This format, until further notice, is not saved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50ba68f5-e9a9-4144-99dc-de8cc44770e9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "seasonality_long_dt <- melt(\n", + " seasonality_source_dt,\n", + " id.vars = grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE, invert = TRUE), # all cols which don't follow the pattern\n", + " variable.name = 'MONTH_BLOCK_SIZE',\n", + " value.name =seasonality_col\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2a400d4-a545-40ab-9204-b6b2e69ea499", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "seasonality_long_dt[, MONTH_BLOCK_SIZE := possible_month_block_sizes[match(MONTH_BLOCK_SIZE, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE))]]\n", + "\n", + "# add remaining admin unit columns and save the final results\n", + "admin_seasonality_long_dt <- merge.data.table(admin_data, seasonality_long_dt, by = c(admin_id_col), all = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a7926d5-d4e7-4707-85b8-7020c3738be3", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# order the columns\n", + "specific_cols <- setdiff(names(admin_seasonality_long_dt), names(admin_data)) # last columns\n", + "admin_seasonality_long_dt <- admin_seasonality_long_dt[, .SD, .SDcols = c(common_cols, specific_cols)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "343a07c2-2e0e-49eb-b487-cc8d6431e015", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Keeping for now.\n", + "# # filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, gsub(\"\\\\.\", \"\", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", + "# filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, type_of_seasonality, 'seasonality_long.csv', sep = '_')\n", + "# fwrite(admin_seasonality_long_dt, file.path(OUTPUT_DATA_PATH, filename_admin_seasonality_long_dt))" + ] + }, + { + "cell_type": "markdown", + "id": "36d1f9cb-75b6-4f6a-a18c-b34eb233b8d2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Transform to wide format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e71259c9-29a6-452f-8949-74adb0e62c1c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "seasonality_wide_dt <- compute_min_seasonality_block(\n", + " input_dt=seasonality_source_dt,\n", + " seasonality_column_pattern=check_pattern_seasonality,\n", + " vector_of_possible_month_block_sizes=possible_month_block_sizes,\n", + " # indicator=toupper(type_of_seasonality),\n", + " seasonal_blocksize_colname=season_duration_col,\n", + " valid_value = 1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9284c723-97d4-46f9-8837-7f70dae92a31", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create a new, overall column 'SEASONALITY_' based on the values of columns in 'check_pattern_seasonality'\n", + "seasonality_pattern_cols <- grep(check_pattern_seasonality, names(seasonality_wide_dt), value = TRUE)\n", + "if (length(seasonality_pattern_cols) > 0L) {\n", + " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_col) := ifelse(rowSums(.SD == 1, na.rm = TRUE) > 0, 1L, 0L), .SDcols = seasonality_pattern_cols]\n", + " seasonality_wide_dt <- seasonality_wide_dt[, (seasonality_pattern_cols) := NULL]\n", + "} else {\n", + " seasonality_wide_dt[, (seasonality_col) := NA_integer_]\n", + "}\n", + "\n", + "# Compute CASES_PROPORTION: proportion of cases in the seasonal block vs ANNUAL total\n", + "# Only for seasonal admin units (SEASONALITY_CASES = 1)\n", + "annual_totals_dt <- imputed_dt[, .(ANNUAL_TOTAL = sum(get(imputed_col), na.rm = TRUE)), by = c(admin_id_col, year_col)]\n", + "\n", + "seasonality_wide_dt[, (cases_proportion_col) := mapply(\n", + " compute_cases_proportion,\n", + " admin_id = get(admin_id_col),\n", + " block_duration = get(season_duration_col),\n", + " MoreArgs = list(\n", + " row_data = row_seasonality_dt,\n", + " annual_data = annual_totals_dt,\n", + " admin_col = admin_id_col,\n", + " year_column = year_col\n", + " )\n", + ")]\n", + "\n", + "# Set CASES_PROPORTION to NA for non-seasonal admin units\n", + "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (cases_proportion_col) := NA_real_]\n", + "\n", + "# Compute SEASONAL_BLOCK_START_MONTH: first month of the seasonal block\n", + "# Only for seasonal admin units (SEASONALITY_CASES = 1)\n", + "seasonality_wide_dt[, (season_start_month_col) := mapply(\n", + " compute_start_month,\n", + " admin_id = get(admin_id_col),\n", + " block_duration = get(season_duration_col),\n", + " MoreArgs = list(\n", + " row_data = row_seasonality_dt,\n", + " admin_col = admin_id_col,\n", + " month_column = month_col\n", + " )\n", + ")]\n", + "\n", + "# Set SEASONAL_BLOCK_START_MONTH to NA for non-seasonal admin units\n", + "seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (season_start_month_col) := NA_integer_]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5ea18ef-1148-432d-a954-b9c6b4a06afc", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# add remaining admin unit columns and save the final results\n", + "admin_seasonality_wide_dt <- merge.data.table(admin_data, seasonality_wide_dt, by = c(admin_id_col), all = TRUE)\n", + "admin_seasonality_wide_dt <- admin_seasonality_wide_dt[, .SD, .SDcols = c(common_cols, seasonality_cols)]\n", + "# head(admin_seasonality_wide_dt)" + ] + }, + { + "cell_type": "markdown", + "id": "a2f1a373-4b34-42db-b591-b25c7050dee6", + "metadata": {}, + "source": [ + "**Save output**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ded1e86d-f4c5-4fdb-a6ba-611d5c7f4aed", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create the filename\n", + "file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'seasonality', sep = '_')\n", + "filename_csv = glue(\"{file_stem}.csv\")\n", + "filename_parquet = glue(\"{file_stem}.parquet\")\n", + "fwrite(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_csv))\n", + "write_parquet(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))\n", + "log_msg(paste0(\"Case seasonality results saved in folder \", OUTPUT_DATA_PATH))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9533db1c-a9db-42be-a4cd-4076d9f212ba", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, \"row_seasonality.csv\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b39e53f-f4fe-49e4-8e4d-9a6679313a54", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# fwrite(seasonality_source_dt, file.path(OUTPUT_DATA_PATH, \"processed_seasonality.csv\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6becf7c6-a821-4ed0-a91f-3b8dc2da6313", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] } - }, - "outputs": [], - "source": [ - "# fwrite(seasonality_source_dt, file.path(OUTPUT_DATA_PATH, \"processed_seasonality.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6becf7c6-a821-4ed0-a91f-3b8dc2da6313", - "metadata": { - "vscode": { - "languageId": "r" + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r b/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r new file mode 100644 index 0000000..5a50ee2 --- /dev/null +++ b/pipelines/snt_seasonality_cases/utils/snt_seasonality_cases.r @@ -0,0 +1,170 @@ +#' Bootstrap runtime context for seasonality-cases pipeline. +#' +#' Loads shared helpers/packages, initializes OpenHEXA SDK handle, and creates +#' output/intermediate directories used by the notebook workflow. +#' +#' @param root_path Root workspace path. +#' @param required_packages Character vector of required R packages. +#' @param load_openhexa Whether to import OpenHEXA SDK. +#' @return Named list with paths and OpenHEXA handle. +bootstrap_seasonality_cases_context <- function( + root_path = "~/workspace", + required_packages = c( + "jsonlite", "data.table", "ggplot2", "fpp3", "arrow", "glue", + "sf", "RColorBrewer", "httr", "reticulate" + ), + load_openhexa = TRUE +) { + code_path <- file.path(root_path, "code") + config_path <- file.path(root_path, "configuration") + output_data_path <- file.path(root_path, "data", "seasonality_cases") + intermediate_results_path <- file.path(output_data_path, "intermediate_results") + dir.create(output_data_path, recursive = TRUE, showWarnings = FALSE) + dir.create(intermediate_results_path, recursive = TRUE, showWarnings = FALSE) + + source(file.path(code_path, "snt_utils.r")) + install_and_load(required_packages) + + Sys.setenv(PROJ_LIB = "/opt/conda/share/proj") + Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal") + Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python") + + openhexa <- NULL + if (load_openhexa) { + openhexa <- reticulate::import("openhexa.sdk") + } + assign("openhexa", openhexa, envir = .GlobalEnv) + + list( + ROOT_PATH = root_path, + CODE_PATH = code_path, + CONFIG_PATH = config_path, + OUTPUT_DATA_PATH = output_data_path, + INTERMEDIATE_RESULTS_PATH = intermediate_results_path, + openhexa = openhexa + ) +} + +#' Load routine and spatial inputs for seasonality-cases. +#' +#' Downloads country-specific `*_shapes.geojson` and `*_routine.parquet` from +#' the configured dataset, with explicit logging and stop-on-error behavior. +#' +#' @param dataset_name Source dataset identifier/name. +#' @param country_code Country code used in filenames. +#' @return List with `spatial_data` and `original_dt`. +load_seasonality_input_data <- function(dataset_name, country_code) { + spatial_filename <- paste(country_code, "shapes.geojson", sep = "_") + routine_filename <- paste(country_code, "routine.parquet", sep = "_") + + spatial_data <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, spatial_filename) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 shapes file for {country_code}: {conditionMessage(e)}") + log_msg(msg, level = "error") + stop(msg) + } + ) + log_msg(glue::glue("File {spatial_filename} successfully loaded from dataset version: {dataset_name}")) + + original_dt <- tryCatch( + { + get_latest_dataset_file_in_memory(dataset_name, routine_filename) + }, + error = function(e) { + msg <- glue::glue("[ERROR] Error while loading DHIS2 routine file for {country_code}: {conditionMessage(e)}") + log_msg(msg, level = "error") + stop(msg) + } + ) + log_msg(glue::glue("File {routine_filename} successfully loaded from dataset version: {dataset_name}")) + + list(spatial_data = spatial_data, original_dt = original_dt) +} + +#' Compute mean seasonal block proportion of annual cases. +#' +#' For each year and admin unit, computes the maximum forward block sum for the +#' selected duration, divides by annual total cases, and returns the mean ratio. +#' +#' @param admin_id Target administrative unit id. +#' @param block_duration Seasonal block duration in months. +#' @param row_data Row-level seasonality computations. +#' @param annual_data Annual totals by admin/year. +#' @param admin_col Column name for administrative unit id. +#' @param year_column Column name for year. +#' @return Numeric proportion (or NA when not computable). +compute_cases_proportion <- function(admin_id, block_duration, row_data, annual_data, admin_col, year_column) { + if (is.na(block_duration) || is.infinite(block_duration)) { + return(NA_real_) + } + + sum_col <- paste("CASES_SUM", block_duration, "MTH_FW", sep = "_") + if (!sum_col %in% names(row_data)) { + return(NA_real_) + } + + admin_row_data <- row_data[get(admin_col) == admin_id] + admin_annual_data <- annual_data[get(admin_col) == admin_id] + if (nrow(admin_row_data) == 0 || nrow(admin_annual_data) == 0) { + return(NA_real_) + } + + yearly_max_block <- admin_row_data[ + !is.na(get(sum_col)), + .(max_block_sum = if (.N > 0L) max(get(sum_col), na.rm = TRUE) else NA_real_), + by = year_column + ] + + yearly_max_block <- yearly_max_block[is.finite(max_block_sum)] + if (nrow(yearly_max_block) == 0) { + return(NA_real_) + } + + merged <- merge(yearly_max_block, admin_annual_data, by = year_column) + merged <- merged[ANNUAL_TOTAL > 0] + if (nrow(merged) == 0) { + return(NA_real_) + } + + merged[, prop := max_block_sum / ANNUAL_TOTAL] + mean(merged$prop, na.rm = TRUE) +} + + +#' Compute dominant seasonal start month for an admin unit. +#' +#' Uses row-level seasonality flags for a selected block duration and returns +#' the most frequent month marked as seasonal. +#' +#' @param admin_id Target administrative unit id. +#' @param block_duration Seasonal block duration in months. +#' @param row_data Row-level seasonality computations. +#' @param admin_col Column name for administrative unit id. +#' @param month_column Column name for month. +#' @return Integer month (1-12) or NA when not computable. +compute_start_month <- function(admin_id, block_duration, row_data, admin_col, month_column) { + if (is.na(block_duration) || is.infinite(block_duration)) { + return(NA_integer_) + } + + seasonality_row_col <- paste("CASES", block_duration, "MTH_ROW_SEASONALITY", sep = "_") + if (!seasonality_row_col %in% names(row_data)) { + return(NA_integer_) + } + + admin_row_data <- row_data[get(admin_col) == admin_id] + if (nrow(admin_row_data) == 0) { + return(NA_integer_) + } + + seasonal_months <- admin_row_data[get(seasonality_row_col) == 1, get(month_column)] + if (length(seasonal_months) == 0) { + return(NA_integer_) + } + + month_counts <- table(seasonal_months) + as.integer(names(month_counts)[which.max(month_counts)]) +}