
Code Documentation

1. Preparation

import os
import re
import pandas as pd
import numpy as np
import difflib
OUTPUT_DIR = '../output_data'

1.A. Data Input and Output

JMP_INPUT_FILE = '../input_data/JMP/jmp.csv'
JMP_OUTPUT_FILE = f'{OUTPUT_DIR}/table_jmp.csv'
IFS_INPUT_DIR = '../input_data/IFs'
IFS_OUTPUT_FILE = f'{OUTPUT_DIR}/table_ifs.csv'

1.B. Common Functions

Common functions are helper functions used by both data sources (IFS and JMP).

def merge_id(prev_table, keys_table, name):
    # Left-join the key table, replace the text column with a <name>_id
    # foreign key, and use 0 for values that have no key entry.
    merged_df = prev_table.merge(keys_table, left_on=name, right_on=name, how='left')
    merged_df = merged_df.rename(columns={'id': f'{name}_id'})
    merged_df = merged_df.drop(columns=[name])
    merged_df[f'{name}_id'] = merged_df[f'{name}_id'].where(merged_df[f'{name}_id'].notna(), 0).astype(int)
    return merged_df
def cleanup_semicolon(source):
    # Remove all semicolons from the source file, rewriting it in place.
    with open(source, 'r') as file:
        content = file.read()
    updated_content = content.replace(';', '')
    with open(source, 'w') as file:
        file.write(updated_content)
def replace_bs_with_alb(x):
    # Relabel the "BS" JMP category code as "ALB".
    return "ALB" if x == "BS" else x

1.C. Key Table Generator

This function generates a unique key table for a specified column from both the IFS and JMP tables, saving the keys to a CSV file. If the CSV file already exists, it appends new values to the existing file while ensuring unique IDs for each entry.

def create_table_key(dataframe, column):
    file_path = f'{OUTPUT_DIR}/key_{column}.csv'
    new_table = pd.DataFrame(
        dataframe[column].unique(),
        columns=[column]
    ).dropna().sort_values(column).reset_index(drop=True)
    
    # If the file already exists, load it
    if os.path.exists(file_path):
        existing_table = pd.read_csv(file_path)
        # Find the new values that are not in the existing table
        new_values = new_table[~new_table[column].isin(existing_table[column])].copy()
        if not new_values.empty:
            # Assign IDs to the new values, starting after the max existing ID
            max_id = existing_table['id'].max()
            new_values['id'] = range(max_id + 1, max_id + 1 + len(new_values))
            # Append the new values to the existing table
            updated_table = pd.concat([existing_table, new_values], ignore_index=True)
        else:
            updated_table = existing_table  # No new values to add, keep existing table as is
    else:
        # If the file doesn't exist, create new IDs starting from 1
        new_table['id'] = range(1, len(new_table) + 1)
        updated_table = new_table
    updated_table[['id', column]].to_csv(file_path, index=False)
    return updated_table
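
A sketch of how the generated key tables pair with merge_id, again with toy data and assuming key_country.csv does not exist yet: create_table_key writes the key file and returns the id/name pairs, which can be fed straight back into merge_id.

facts = pd.DataFrame({'country': ['Kenya', 'Ghana', 'Ghana'], 'value': [1.0, 2.0, 3.0]})
country_keys = create_table_key(facts, 'country')  # writes key_country.csv under OUTPUT_DIR
print(country_keys)
#   country  id
# 0   Ghana   1
# 1   Kenya   2
table_with_ids = merge_id(facts, country_keys, 'country')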

1.D. Country Mapping

This section compares two lists of country names—jmp_country_list from the JMP dataset and ifs_country_list from the IFS dataset—and finds the closest matches using string similarity. It also includes a mapping for countries with naming differences between the two lists.

data_jmp = pd.read_csv(JMP_INPUT_FILE, encoding='latin-1')
jmp_country_list = list(data_jmp["COUNTRY, AREA OR TERRITORY"].unique())
ifs_country_list = ['All countries WHHS Tool1','Congo Dem. Republic of the','Ethiopia','Ghana','Guatemala','Haiti','India',
                    'Indonesia','Kenya','Liberia','Madagascar','Malawi','Mali','Mozambique','Nepal','Nigeria','Philippines',
                    'Rwanda','Senegal','Sudan South','Tanzania','Uganda','Zambia']
# Find the closest match
for country in ifs_country_list:
    probability = difflib.get_close_matches(country, jmp_country_list, n=3, cutoff=0.4)
    if probability:
        if country not in probability:
            print(f"{country} -> {list(probability)}")
    else:
        print(f"NOT FOUND: {country}")
NOT FOUND: All countries WHHS Tool1
Congo Dem. Republic of the -> ['Democratic Republic of the Congo', 'Republic of Korea', 'Iran (Islamic Republic of)']
Sudan South -> ['Sudan', 'San Marino', 'South Sudan']
Tanzania -> ['Panama', 'Canada', 'Mauritania']
country_mapping = {
    "All countries WHHS Tool1": "All Countries",
    "United Republic of Tanzania": "Tanzania",
    "Congo Dem. Republic of the": "Democratic Republic of the Congo",
    "Sudan South": "South Sudan",
}
def map_country_name(country):
    return country_mapping.get(country, country)
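
A quick check of the mapping (the inputs below come from ifs_country_list and country_mapping above; names without an entry pass through unchanged):

print(map_country_name("Sudan South"))                # -> 'South Sudan'
print(map_country_name("Congo Dem. Republic of the")) # -> 'Democratic Republic of the Congo'
print(map_country_name("Ethiopia"))                   # no mapping entry -> 'Ethiopia'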

3. IFS Dataset

final_columns = ['indicator','year','country','unit','value_name','jmp_category','commitment','value']
files_to_keep = [
    "01. Deaths by Category of Cause - Millions (2nd Dimensions = Diarrhea).csv",
    "06. Poverty Headcount less than $2.15 per Day, Log Normal - Millions.csv",
    "08. State Failure Instability Event - IFs Index.csv",
    "11. Governance Effectiveness - WB index.csv",
    # "12. Value Added by Sector, Currency - Billion dollars.csv",
    "13. Sanitation Services, Access, percent of population (2nd Dimensions = Basic + Safely Managed).csv",
    "14. Sanitation Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv",
    "15. Sanitation Services, Expenditure, Capital, Billion $ (2nd Dimensions = Basic + Safely Managed).csv",
    "16. Sanitation Services, Expenditure, Maintenance, Billion $ (2nd Dimensions = Basic + Safely Managed).csv",
    "17. Water Services, Access, percent of population (2nd Dimension = Basic + Safely Managed).csv",
    "18. Water Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv",
    "19. Water Services, Expenditure, Capital, Billion $ (2nd Dimensions = Basic + Safely Managed).csv",
    "20. Water Services, Expenditure, Maintenance, Billion $ (2nd Dimensions = Basic + Safely Managed).csv",
    # "21. Population - Millions.csv",
    "23. GDP (PPP) - Billion dollars.csv",
    "24. Stunted children, History and Forecast - Million.csv",
    # "25. Population under 5 Years, Headcount - Millions.csv",
    "26. Malnourished Children, Headcount - Millions.csv"
]
year_filter_config = {
    "year_range": {
        "years": list(range(2019, 2050)),
        "files": [
            "13. Sanitation Services, Access, percent of population (2nd Dimensions = Basic + Safely Managed).csv",
            "17. Water Services, Access, percent of population (2nd Dimension = Basic + Safely Managed).csv"
        ]
    },
    "milestone_years": [2030, 2050]
}
# Build the list of input file paths from the curated files_to_keep list
files = [f"{IFS_INPUT_DIR}/{file}" for file in files_to_keep]

3.A. IFS Functions

IFS functions are helper functions used only by the IFS data source.

def base_jmp_category(x):
    if x["value_name"] == "Base":
        if x["category_base"] == "Basic":
            return "BS"
        if x["category_base"] == "SafelyManaged":
            return "SM"
        return np.nan
    return x["jmp_category"]
def get_ifs_name(source):
    source = re.sub(r"\s*\(2nd Dimension.*?\)", "", source)
    return re.sub(r'^\d+\. ', '', source.replace(f"{IFS_INPUT_DIR}/", "")).replace(".csv", "")
def get_value_types(lst):
    lst = lst.split('.')[0]
    lst = lst.replace('_0_','_0.').split("_")
    return lst
def cleanup_data(dataframe):
    dataframe['unit'] = dataframe['unit'].apply(lambda x: x.replace("2017","") if x else None)
    dataframe['value'] = dataframe['value'].apply(lambda x: x.replace(' ','') if ' ' in str(x) else x)
    dataframe['value'] = dataframe['value'].apply(lambda x: x if len(str(x)) > 0 else np.nan)
def filter_dataframe_by_year(dataframe, filename):
    filename = os.path.basename(filename)
    if filename in year_filter_config["year_range"]["files"]: # Filter using the year_range
        filtered_df = dataframe[dataframe['year'].isin(year_filter_config["year_range"]["years"])]
    else: # Filter using milestone_years
        filtered_df = dataframe[dataframe['year'].isin(year_filter_config["milestone_years"])]
    return filtered_df.reset_index(drop=True)
def remove_unmatches_jmp_category(x):
    if x["value_name"] != "Base":
        if x["category_base"] == "Basic" and x["jmp_category"] == "SM":
            return True
        if x["category_base"] == "SafelyManaged" and x["jmp_category"] == "BS":
            return True
    return False
def remove_unmatch_commitment(x):
    if str(x['commitment']).strip() == "2030":
        if str(x["year"]) != "2030":
            return True
    if str(x['commitment']).strip() == "2050":
        if str(x["year"]) != "2050":
            return True
    return False
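
To make the two parsing helpers above concrete: get_ifs_name reduces a file path to a clean indicator name, and get_value_types splits a value column label into its value_name / jmp_category / commitment parts. The first call uses a real file name from files_to_keep; the column labels are invented for illustration, since the exact labels depend on the IFs export ("Base" is the value_name checked by base_jmp_category above).

path = f"{IFS_INPUT_DIR}/14. Sanitation Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv"
print(get_ifs_name(path))
# -> 'Sanitation Services, Access, Number of people, million'
print(get_value_types("Base.2"))              # -> ['Base']
print(get_value_types("FullWASH_SM_2030.1"))  # hypothetical label -> ['FullWASH', 'SM', '2030']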

3.B. IFS Data Processing

3.B.1. Combine, Filter and Remap IFS Values

This section transforms the IFS data files into a unified DataFrame (combined_df) by cleaning, reshaping, and filtering the data to prepare it for analysis.
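
Before the loop itself, a toy illustration of the reshaping step it relies on (the columns below are invented and far simpler than the real four-level IFs headers): after the header rows are read, every non-Year column is addressed by a tuple, and melting turns those tuples into a 'variable' column that the loop unpacks positionally into country, category, unit and value label.

toy = pd.DataFrame([[2030, 55.2, 30.1],
                    [2050, 61.0, 38.4]])
toy.columns = ['Year',
               ('Ghana', 'Basic', 'Percent', 'Base.1'),
               ('Ghana', 'SafelyManaged', 'Percent', 'Base.2')]
melted = toy.melt(id_vars=['Year'], var_name='variable', value_name='value')
row = melted.to_dict('records')[0]
print(row['variable'][0], row['variable'][1], row['variable'][3])
# -> Ghana Basic Base.1  (country, category_base, and the label passed to get_value_types)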

combined_df = pd.DataFrame(columns=final_columns)
for file in files:
    print(f"Processing {file}")
    # test only 1 file
    # if file != "../input_data/IFs/14. Sanitation Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv":
    #   continue
    cleanup_semicolon(file)
    data = pd.read_csv(file, header=[1,2,4,5], sep=',')
    new_columns = list(data.columns)
    for i, col in enumerate(new_columns):
        if col == ('Unnamed: 0_level_0', 'Unnamed: 0_level_1', 'Unnamed: 0_level_2', 'Unnamed: 0_level_3'):
            new_columns[i] = 'Year'
    data.columns = new_columns
    df = pd.DataFrame(data.to_dict('records'))
    df_melted = df.melt(id_vars=['Year'], var_name='variable', value_name='value')
    new_data = []
    for value_list in df_melted.to_dict('records'):
        value_type = get_value_types(value_list["variable"][3])
        new_data.append({
            "year": int(value_list["Year"]),
            "country": map_country_name(value_list["variable"][0]),
            "category_base": value_list["variable"][1],
            "unit": value_list["variable"][2],
            "value_type": list(filter(lambda v:v,value_type)),
            "value": value_list["value"]
        })
    df = pd.DataFrame(new_data)
    df = filter_dataframe_by_year(df, file)
    df_split = pd.DataFrame(df['value_type'].tolist(), index=df.index)
    df_split.columns = ['value_name', 'jmp_category', 'commitment']
    df_final = pd.concat([df, df_split], axis=1)
    df_final['indicator'] = get_ifs_name(file)
    df_final['jmp_category'] = df_final.apply(base_jmp_category, axis=1)
    df_final['jmp_category'] = df_final['jmp_category'].apply(replace_bs_with_alb)
    df_final['remove'] = df_final.apply(remove_unmatches_jmp_category, axis=1)
    df_final = df_final[df_final['remove'] == False].reset_index(drop=True)
    df_final = df_final[final_columns]
    combined_df = pd.concat([combined_df.dropna(axis=1, how='all'), df_final], ignore_index=True)
Processing ../input_data/IFs/01. Deaths by Category of Cause - Millions (2nd Dimensions = Diarrhea).csv
Processing ../input_data/IFs/06. Poverty Headcount less than $2.15 per Day, Log Normal - Millions.csv
Processing ../input_data/IFs/08. State Failure Instability Event - IFs Index.csv
Processing ../input_data/IFs/11. Governance Effectiveness - WB index.csv
Processing ../input_data/IFs/13. Sanitation Services, Access, percent of population (2nd Dimensions = Basic + Safely Managed).csv
Processing ../input_data/IFs/14. Sanitation Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv
Processing ../input_data/IFs/15. Sanitation Services, Expenditure, Capital, Billion $ (2nd Dimensions = Basic + Safely Managed).csv
Processing ../input_data/IFs/16. Sanitation Services, Expenditure, Maintenance, Billion $ (2nd Dimensions = Basic + Safely Managed).csv
Processing ../input_data/IFs/17. Water Services, Access, percent of population (2nd Dimension = Basic + Safely Managed).csv
Processing ../input_data/IFs/18. Water Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv
Processing ../input_data/IFs/19. Water Services, Expenditure, Capital, Billion $ (2nd Dimensions = Basic + Safely Managed).csv
Processing ../input_data/IFs/20. Water Services, Expenditure, Maintenance, Billion $ (2nd Dimensions = Basic + Safely Managed).csv
Processing ../input_data/IFs/23. GDP (PPP) - Billion dollars.csv
Processing ../input_data/IFs/24. Stunted children, History and Forecast - Million.csv
Processing ../input_data/IFs/26. Malnourished Children, Headcount - Millions.csv
# Test for Congo
# combined_df[
#    (combined_df["indicator"] == "Sanitation Services, Access, Number of people, million") &
#    (combined_df["country"] == "Democratic Republic of the Congo") &
#    (combined_df["value_name"] == "Base")
#]

Remove rows where the commitment does not match the year.