Dataset: human_presence

import os

import pandas as pd
import rasterio
import requests
import src.utils as ut

# Resolve the root path of the application; the download folder below is
# built relative to it.
project_path = ut.project_path()

# Load the metadata

# Paths handed together to ut.load_metadata.
# NOTE(review): the integer passed to ut.project_path presumably selects a
# different base directory for each entry (JSON metadata vs. HTML export) --
# confirm against src.utils.
meta_filename = [
    f"{ut.project_path(1)}/meta/environment/human_presence.json",
    f"{ut.project_path(2)}/meta_ipynb/human_presence.html",
]
metadata = ut.load_metadata(meta_filename)

# Print a summary of the loaded metadata (its name and the distributions it
# lists); the contentUrl itself is fetched later with ut.get_meta.
ut.info_meta(metadata)
Metadata name: human_presence
	distribution 0: mosquitoalert_webserver

1. Distribution by HTTP download from MosquitoAlert webserver

# Get metadata: the download URL plus the dataset and distribution names of
# the first distribution (index 0); no hasPart entry is selected.
contentUrl, dataset_name, distr_name = ut.get_meta(
    metadata, idx_distribution=0, idx_hasPart=None
)

# Make folders for data download:
# <project_path>/data/<dataset_name>/<distr_name>
path = f"{project_path}/data/{dataset_name}/{distr_name}"
ut.makedirs(path)
Info
dataset name: human_presence
distribution name: mosquitoalert_webserver
distribution description: Distribution by HTTP download from MosquitoAlert webserver
# Download the raster from the web-server
filepath = os.path.join(path, os.path.basename(contentUrl))
# Stream the body to disk in chunks so the whole raster is never held in
# memory, and close the connection when done. The timeout is
# (connect, read-between-bytes) in seconds, so a stalled server cannot hang
# the run forever.
with requests.get(contentUrl, stream=True, timeout=(10, 120)) as r:
    # Fail loudly on 4xx/5xx instead of saving an error page as the raster.
    r.raise_for_status()
    with open(filepath, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

Open the raster with Rasterio and set the “no data” value to -1 in every band.

# Read every band into one array; the file only needs to stay open for the
# read itself.
with rasterio.open(filepath) as src:
    raster = src.read()

# Flag missing pixels with -1, band by band (in-place assignment).
raster[0, raster[0] == 0] = -1  # climatic regions: 0 is nodata
raster[1, raster[1] < 0] = -1  # population density: negatives are nodata
raster[2, raster[2] == 0] = -1  # gadm: 0 is nodata
raster[3, raster[3] == 0] = -1  # ecoregions: 0 is nodata

# Flatten to one row per pixel and one column per band, then discard every
# pixel that is missing in at least one band.
n_bands = raster.shape[0]
raster = raster.T.reshape((-1, n_bands))
raster = raster[~(raster == -1).any(axis=1)]

# Column names in band order, followed by the order wanted in the output.
band_columns = [
    "code_climate_regions",
    "human_presence",
    "code_gadm",
    "code_ecoregions",
]
output_columns = [
    "human_presence",
    "code_gadm",
    "code_climate_regions",
    "code_ecoregions",
]
raster = pd.DataFrame(raster, columns=band_columns)[output_columns]

# Print a summary of the resulting table (row count, column dtypes and
# memory usage) as a quick sanity check before saving.
raster.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24604668 entries, 0 to 24604667
Data columns (total 4 columns):
 #   Column                Dtype
---  ------                -----
 0   human_presence        int32
 1   code_gadm             int32
 2   code_climate_regions  int32
 3   code_ecoregions       int32
dtypes: int32(4)
memory usage: 375.4 MB
# Save on parquet: written as <path>/human_presence.parquet, in the same
# folder as the downloaded raster.
filename = f"{path}/human_presence"
raster.to_parquet(f"{filename}.parquet")