Dataset: human_presence
Contents
Dataset: human_presence
import os
import pandas as pd
import rasterio
import requests
import src.utils as ut
# Resolve the root path of the application; data folders are built from it.
project_path = ut.project_path()

# Metadata descriptors for this dataset, passed together to the loader.
# NOTE(review): the integer passed to ut.project_path() presumably selects a
# different base directory -- confirm against src.utils.
meta_json = f"{ut.project_path(1)}/meta/environment/human_presence.json"
meta_html = f"{ut.project_path(2)}/meta_ipynb/human_presence.html"
meta_filename = [meta_json, meta_html]

metadata = ut.load_metadata(meta_filename)

# Show the metadata name and the distributions it declares.
ut.info_meta(metadata)
Metadata name: human_presence
distribution 0: mosquitoalert_webserver
1. Distribution by HTTP download from MosquitoAlert webserver
# Look up the first distribution (index 0, no hasPart entry) and unpack the
# download URL together with the dataset and distribution names.
meta_fields = ut.get_meta(metadata, idx_distribution=0, idx_hasPart=None)
contentUrl, dataset_name, distr_name = meta_fields

# Make the folder that will receive the downloaded data.
path = f"{project_path}/data/{dataset_name}/{distr_name}"
ut.makedirs(path)
Info
dataset name: human_presence
distribution name: mosquitoalert_webserver
distribution description: Distribution by HTTP download from MosquitoAlert webserver
# Download the raster from the web server.
# The file is stored under `path`, named after the last component of the URL.
filepath = os.path.join(path, os.path.basename(contentUrl))

# Stream the body to disk in chunks so the whole raster is never held in
# memory, and close the connection when done. The timeout keeps a stalled
# server from hanging the notebook forever.
with requests.get(contentUrl, stream=True, timeout=60) as r:
    # Fail loudly on 4xx/5xx instead of saving an error page as the raster.
    r.raise_for_status()
    with open(filepath, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
Open the raster with Rasterio and set the “no data” value of every band to -1. Pixels that have no data in any band are then dropped.
# Read every band of the downloaded raster into a single array.
with rasterio.open(filepath) as src:
    raster = src.read()

# Setup nodata value (-1) band by band. Indexing the first axis gives a view,
# so the masked assignments below modify `raster` in place.
zero_is_nodata = (0, 2, 3)  # climatic regions, gadm, ecoregions
for band_idx in zero_is_nodata:
    band = raster[band_idx]
    band[band == 0] = -1
population = raster[1]  # population density: any negative value is nodata
population[population < 0] = -1

# Flatten to one row per pixel with one column per band, then keep only the
# pixels that carry data in every band.
n_bands = raster.shape[0]
raster = raster.T.reshape((-1, n_bands))
valid_rows = (raster != -1).all(axis=1)
raster = raster[valid_rows]

# Column names in the order the bands are stored in the raster file.
band_columns = [
    "code_climate_regions",
    "human_presence",
    "code_gadm",
    "code_ecoregions",
]
# Column order wanted in the final table.
output_columns = [
    "human_presence",
    "code_gadm",
    "code_climate_regions",
    "code_ecoregions",
]
raster = pd.DataFrame(raster, columns=band_columns)[output_columns]
raster.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24604668 entries, 0 to 24604667
Data columns (total 4 columns):
# Column Dtype
--- ------ -----
0 human_presence int32
1 code_gadm int32
2 code_climate_regions int32
3 code_ecoregions int32
dtypes: int32(4)
memory usage: 375.4 MB
# Write the pixel table to a parquet file inside the distribution folder.
filename = f"{path}/human_presence"
parquet_file = f"{filename}.parquet"
raster.to_parquet(parquet_file)