Dataset: reports

from datetime import datetime
import pandas as pd
import requests, zipfile, io, json, re, os, glob, shutil
from bs4 import BeautifulSoup
import urllib.parse

import src.utils as ut

# Setup the root path of the application
project_path = ut.project_path()

# Load the metadata

meta_filename = [
    f"{ut.project_path(1)}/meta/mosquito_alert/reports.json",
    f"{ut.project_path(2)}/meta_ipynb/reports.html",
]
metadata = ut.load_metadata(meta_filename)

# Get contentUrl from metadata file
ut.info_meta(metadata)

1. Distribution from Zenodo cloud

This dataset is updated nightly and the most recent version can be downloaded from Zenodo at https://doi.org/10.5281/zenodo.597466. This URL will always resolve to the most recent version of the data.
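As a minimal check (assuming network access), following the concept DOI should land on the most recent record; the record id in the final URL changes with each new version.

# Sketch: the concept DOI redirects to the landing page of the latest record
r = requests.get("https://doi.org/10.5281/zenodo.597466", allow_redirects=True)
print(r.url)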

# Get metadata
contentUrl, dataset_name, distr_name = ut.get_meta(
    metadata, idx_distribution=1, idx_hasPart=None
)

# Make folders for data download
path = f"{project_path}/data/{dataset_name}/{distr_name}"
ut.makedirs(path)
# Download and open the zip container

# Get the latest Zenodo file version of the dataset
r = requests.get(contentUrl)
file_url = BeautifulSoup(r.content, "html.parser").find(
    "a", {"class": "ui compact mini button"}
)["href"]
file_contentUrl = urllib.parse.urljoin(r.url, file_url)
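As a sketch of an alternative that avoids scraping the landing page, the same record can be queried through the Zenodo REST API; the exact layout of the files entries is an assumption here and may differ between API versions.

# Hedged alternative (sketch): query the Zenodo REST API instead of parsing HTML.
# The record id 597466 comes from the DOI above; the structure of the "files"
# entries is an assumption and may differ between Zenodo API versions.
api_record = requests.get("https://zenodo.org/api/records/597466").json()
for entry in api_record.get("files", []):
    print(entry.get("key"), entry.get("links"))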

# Download the dataset
r_file = requests.get(file_contentUrl)
z = zipfile.ZipFile(io.BytesIO(r_file.content))
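Before extracting anything, it can be useful to peek at the contents of the downloaded archive:

# List the first few entries of the downloaded archive
print(z.namelist()[:5])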

We have the option to extract all the report files into a distribution folder.

# Extract everything that was downloaded
z.extractall(path)

# Extract all the reports
for zip_name in glob.glob(f"{path}/*/*.zip"):
    with zipfile.ZipFile(zip_name, "r") as zip_obj:
        for member in zip_obj.filelist:
            filename = os.path.basename(member.filename)
            with open(f"{path}/{filename}", "wb") as out_file:
                out_file.write(zip_obj.open(member).read())

# Move the translation_dict and remove the download folder
trans_dict = "translation_dict.json"
path_download = f"{path}/{z.filelist[0].filename}"
shutil.move(os.path.join(path_download, trans_dict), os.path.join(path, trans_dict))
shutil.rmtree(path_download)
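A quick listing confirms that the report files and the translation dictionary now sit directly in the distribution folder:

# Inspect the distribution folder after the cleanup (first few entries only)
print(sorted(os.listdir(path))[:10])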

Alternatively, we can concatenate all the reports into a single dataframe and save it as a file.

# Merge all reports into a dataframe. JSON is a nested data format, so one should
# decide to what level the data should be unrolled (i.e. JSON sublevels are turned
# into new columns). Note that flattening the JSON yields a bigger table with many
# columns. Here we provide two options: a compact table (level = 0) and a fully
# expanded table (level = None). For more information, see the Pandas function
# [_json_normalize_](https://pandas.pydata.org/docs/reference/api/pandas.json_normalize.html)

level = 0  # normalize only the root level (keys->column names, values->table values)
# level = None  # normalize all levels

df_reports = []
df_responses = []

for report_name in glob.glob(f"{path}/all_reports*.json"):
    with open(report_name) as f:
        d = json.loads(f.read())
    df_dict = pd.json_normalize(d, max_level=level)
    df_reports.append(df_dict)
    if level is None:
        df_list = pd.json_normalize(d, max_level=level, record_path=["responses"])
        df_responses.append(df_list)

df = pd.concat(df_reports)
if level is None:
    df_responses = pd.concat(df_responses)

Note

If we normalize all the levels (i.e. level = None), then the responses attribute of the reports dataset is a list of dictionaries and is therefore bypassed by json_normalize. In this case, we need to explicitly flatten responses and concatenate it with all the other attributes. On the other hand, if the table is compact (i.e. level = 0), we can use json_normalize to unroll the attributes of interest, but first we need to transform them into a dictionary format.
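To make the difference between the two levels concrete, here is a toy record (made up for illustration, not taken from the dataset) normalized in the three relevant ways:

# Toy example (illustrative only): the effect of max_level on a nested record
toy = [{
    "id": 1,
    "location": {"lat": 52.4, "lon": 4.9},
    "responses": [{"question_id": 1, "answer_value": 3}],
}]
print(pd.json_normalize(toy, max_level=0).columns.tolist())     # nested values stay as objects
print(pd.json_normalize(toy, max_level=None).columns.tolist())  # dicts expand into dotted columns
print(pd.json_normalize(toy, record_path="responses").columns.tolist())  # lists need record_path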

Let’s work through an example of unrolling nested attributes. Suppose that we would like to study mosquito bites in the Netherlands. This information is given by the responses attribute.

# Unroll the _responses_ attribute if we only unrolled the root level
if level == 0:
    col = "responses"
    df_responses = pd.json_normalize(
        df[[col]].to_dict(orient="records"), record_path=col
    )

# Join responses with some useful attributes from the main table to filter and
# group bites over space and time
col = ["version_UUID", "creation_year", "creation_month", "type", "country"]
df_responses = df_responses.merge(
    df[col], left_on="report", right_on="version_UUID", validate="m:m"
).drop(["report"], axis=1)

# Transform float types to nullable integers. Note that the nullable 'Int16' type
# supports missing values (NA), while plain 'int' does not, so the cast would fail
# if missing values were present.
df_responses = df_responses.astype(
    {
        "question_id": "Int16",
        "answer_id": "Int16",
        "answer_value": "Int16",
    }
)
# Filter for total mosquito bites registered by a user in the Netherlands (ISO 3166-1 alpha-3 country code)
df_bite = df_responses.query("type == 'bite' & country == 'NLD' & question_id == 1")
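The cast to the nullable 'Int16' type above matters because missing IDs would break a cast to a plain integer; a small illustrative sketch:

# Illustrative sketch: nullable 'Int16' keeps missing values, plain int cannot
s = pd.Series([1.0, None, 3.0])
print(s.astype("Int16"))  # the missing value becomes <NA>
# s.astype("int16")       # would raise: non-finite values cannot be cast to a plain int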

# Count bites on a monthly basis grouped by year
df_bite_stats = df_bite.groupby(["creation_year", "creation_month"])[
    "answer_value"
].sum()

# Plot the temporal coverage of bites for the Netherlands on a log scale
p = df_bite_stats.unstack(level=0).plot(
    kind="bar", stacked=True, logy=True, ylabel="Bite counts"
)
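If we want to keep the chart as a file, the matplotlib axes returned by the plot call can be saved (a minimal sketch; the output filename is just an example):

# Optional sketch: save the bite-count plot next to the data (filename is an example)
p.get_figure().savefig(f"{path}/bite_counts_nld.png", dpi=150, bbox_inches="tight")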

Some report attributes are key-value, JSON-like data that need additional tables to be fully interpretable (for example, tiger_responses, responses, etc.). Since multi-language translations are available, we use the language code as the index.

# Open translation dictionary
with open(path + "/" + trans_dict) as f:
    r = f.read()

try:
    d = json.loads(r)
except ValueError:
    print("Warning: not a valid JSON format. Trying to remove the trailing comma.")
    try:
        r = re.sub(r"\"\s*,\s*\}", '" }', r)
        d = json.loads(r)
    except ValueError:
        print("JSON format is still not valid.")

df_reports_translation = pd.DataFrame.from_dict(d, orient="index")
df_reports_translation.info()
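A quick peek at one language row shows how the table is organized (the 'en' row is the one used in the translation example below):

# Peek at the English entries of the translation table
print(df_reports_translation.loc["en"].head())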

Imagine that you need to translate the questions and answers about mosquito bites (i.e. the df_responses dataframe) into another language. The translation table above can do this job for you.

# For example, let's translate to English.
lang = "en"
translation = df_reports_translation.loc[lang].to_dict()
answers = df_responses["answer"].map(translation)
questions = df_responses["question"].map(translation)
df_responses.insert(1, f"translated_answer_{lang}", answers)
df_responses.insert(1, f"translated_question_{lang}", questions)

Note

Not all reports can be translated: older versions of the Mosquito Alert mobile app did not record the question and answer ID tags. Translations are available for reports from 2020 onwards.
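A rough check of how many responses remain untranslated after the mapping (a sketch; responses without ID tags simply map to missing values):

# Sketch: share of responses whose question could not be translated
missing_share = df_responses[f"translated_question_{lang}"].isna().mean()
print(f"Untranslated questions: {missing_share:.1%}")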

# Save the reports as CSV or Parquet
filename = f"{path}/all_reports"
df.to_parquet(f"{filename}.parquet")  # much smaller file size (requires pyarrow)
df.to_csv(f"{filename}.csv")  # roughly 10x the size compared with the dataframe

# Save the reports responses unrolled
df_responses.to_parquet(f"{filename}_responses.parquet")
df_responses.to_csv(f"{filename}_responses.csv")

# Save the reports translation as CSV
df_reports_translation.to_csv(f"{filename}_translation.csv")
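As a quick sanity check, the Parquet file can be read back and its shape compared with the in-memory dataframe (sketch):

# Sketch: round-trip check of the saved Parquet file
df_check = pd.read_parquet(f"{filename}.parquet")
print(df.shape, df_check.shape)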

2. Distribution from the MosquitoAlert GitHub repository

This dataset is also updated daily on GitHub and can be accessed from there.

# Get metadata
contentUrl, dataset_name, distr_name = ut.get_meta(
    metadata, idx_distribution=1, idx_hasPart=None
)

# Make folders for data download
path = f"{project_path}/data/{dataset_name}/{distr_name}"
ut.makedirs(path)
# Download the dataset
r_file = requests.get(contentUrl[0])
z = zipfile.ZipFile(io.BytesIO(r_file.content))

We have the option to extract all the report files into a distribution folder.

# Extract everything that was downloaded
z.extractall(path)

# Move the report files and remove the download folder
for file in glob.glob(f"{path}/**/*.json", recursive=True):
    shutil.copy(file, path)

path_download = os.path.dirname(file)
shutil.rmtree(path_download)
# The procedure for converting from JSON to CSV is the same as in the Zenodo distribution example

level = 0  # normalize only the root level (keys->column names, values->table values)
# level = None  # normalize all levels

df_reports = []
df_responses = []

for report_name in glob.glob(f"{path}/all_reports*.json"):
    with open(report_name) as f:
        d = json.loads(f.read())
    df_dict = pd.json_normalize(d, max_level=level)
    df_reports.append(df_dict)
    if level is None:
        df_list = pd.json_normalize(d, max_level=level, record_path=["responses"])
        df_responses.append(df_list)

df = pd.concat(df_reports)
if level is None:
    df_responses = pd.concat(df_responses)
# Request other supporting material for the reports and put it into dataframes.
# Since multi-language translations are available, we use the language code as the index

url = contentUrl[-1]
trans_dict = f"{path}/translation_dict.json"
ut.download_file(url, trans_dict)
# Open translation dictionary
with open(trans_dict) as f:
    r = f.read()

try:
    d = json.loads(r)
except ValueError:
    print("Warning: not a valid JSON format. Trying to remove the trailing comma.")
    try:
        r = re.sub(r"\"\s*,\s*\}", '" }', r)
        d = json.loads(r)
    except ValueError:
        print("JSON format is still not valid.")

df_reports_translation = pd.DataFrame.from_dict(d, orient="index")
df_reports_translation.info()
# Save the reports as CSV or Parquet
filename = f"{path}/all_reports"
df.to_parquet(f"{filename}.parquet")  # much smaller file size (requires pyarrow)
df.to_csv(f"{filename}.csv")  # roughly 10x the size compared with the dataframe

# Save the reports translation as CSV
df_reports_translation.to_csv(f"{filename}_translation.csv")