AI for Health - COVID-19

📆 Project Period	October, 2023
📍 GitHub	gitlab.esa.int

# importing packages

import calendar
import datetime
import os.path
import cdsapi
import numpy as np
from netCDF4 import Dataset
import pandas as pd
import xarray as xr
import pdb
import netCDF4
from geopandas.tools import sjoin
from time import process_time
import pickle


# to cope with the windows bug on basemap - not needed on Linux
# The folder can change on different computers
# os.environ['PROJ_LIB'] = r'C:\\Users\\dario\\Anaconda3\\pkgs\\proj4-5.2.0-h6538335_1006\\Library\\share'
from mpl_toolkits.basemap import Basemap
import mpl_toolkits.basemap as bmp
import matplotlib.pyplot as plt
# To install geopands on Windows, follow the instructions reported on the page:
# <https://www.hatarilabs.com/ih-en/how-to-install-python-geopandas-on-anaconda-in-windows-tutorial>
import geopandas as gpd

# US Geographical region
# (North, West, South, East)
geo_limits = [50, -126, 24, -66]

Download of ERA5 data for Covid-19 analysis in US during the first wave

This code manages the download of atmospheric data from ERA5 reanalysis repository. Moreover, some processing on the data is performed, such as grouping the data over the US counties and avaraging the information for each day.

Downloading atmospheric values @1000 hPa at 0.25 deg resolution

# from <https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-pressure-levels?tab=form>

c = cdsapi.Client()

if os.path.exists(f'./US_data_01deg_res_hum_and_temp_with_wind.nc') is False:
    c.retrieve(
        'reanalysis-era5-pressure-levels',
        {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': [
                'ozone_mass_mixing_ratio', 'relative_humidity',
                'specific_humidity', 'temperature',
            ],
            'pressure_level': '1000',
            'year': [
                '2020'
            ],
            'month': [
                '01', '02', '03',
                '04', '05', '06',
                '07', '08'
            ],
            'day': [
                '01', '02', '03',
                '04', '05', '06',
                '07', '08', '09',
                '10', '11', '12',
                '13', '14', '15',
                '16', '17', '18',
                '19', '20', '21',
                '22', '23', '24',
                '25', '26', '27',
                '28', '29', '30',
                '31'
            ],
            'time': [
                '00:00', '06:00', '12:00', '18:00'
            ],
            'area': geo_limits,
            'grid': [0.1,0.1],
        },
        f'US_data_01deg_res_hum_and_temp.nc')
else:
    print(f'The file US_data_01deg_res_hum_and_temp.nc has been already downloaded!')

Creating the dataset with daily frequency

# To run this piece of code, we need:
# 1. US_data_025deg_res_hum_and_temp
# 2. US_data_025deg_res


def print_data(*argv):

    input_file = argv[0]
    with xr.open_dataset(input_file) as nc:
        print(nc.variables)

    return 0

def add_hum_poll_data(*argv):

    input_file = argv[0]

    with xr.open_dataset(input_file) as nc:
        #print(nc.variables)

        # deleting Nan and removing expver
        if 'expver' in list(nc.variables):
            nc = nc.sel(expver=1).fillna(nc.sel(expver=5))

        # resampling the dataset with 1day frequency
        nc_1Dresample = nc.resample(time='1D').mean().rename({'t':'temp_mean','o3':'o3_mean','r':'relHum_mean','q':'specHum_mean'}).merge(
                nc.resample(time='1D').std().rename({'t':'temp_std','o3':'o3_std','r':'relHum_std','q':'specHum_std'})).merge(
                nc.resample(time='1D').max().rename({'t':'temp_max','o3':'o3_max','r':'relHum_max','q':'specHum_max'})).merge(
                nc.resample(time='1D').min().rename({'t':'temp_min','o3':'o3_min','r':'relHum_min','q':'specHum_min'})).merge(
                nc.resample(time='1D').median().rename({'t':'temp_median','o3':'o3_median','r':'relHum_median','q':'specHum_median'}))

    if len(argv) == 2:
        nc_1Dresample = argv[1].merge(nc_1Dresample)


    return nc_1Dresample

dataset_US = add_hum_poll_data('US_data_01deg_res_hum_and_temp.nc')

C:\\Users\\dario\\anaconda3\\lib\\site-packages\\xarray\\core\\common.py:1106: FutureWarning: 'base' in .resample() and in Grouper() is deprecated.
The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  freq=freq, closed=closed, label=label, base=base, loffset=loffset

# Transform the dataset in a Pandas dataframe (just for visual inspection)
dataset_US.to_dataframe()

# Saving the dataframe in the current working folder
dataset_US.to_netcdf('US_data.nc',format='NETCDF4')

Visual inspection of the ERA5 data

# Read the dataset
input_dataset = 'US_data_01deg_res_hum_and_temp.nc'

ds = Dataset(input_dataset,'r')
# Extracting longitudes and latitudes
lons = ds.variables['longitude'][:]
lats = np.sort(ds.variables['latitude'][:])

def initialize_map(proj,res,x,y,lats,lons):
    """This function initialize the basemap for data visualization.

    Args:
        proj (str): 'merc' or 'stere'
        res (str): resolution: 'c' (crude), 'l' (low), 'i' (intermediate), 'h' (high), 'f' (full)
        x (ndarray): x Cartesian coordinates in m
        y (ndarray): y Cartesian coordinates in m
        lats (ndarray):  latitude coordinates in degree
        lons (ndarray):  longitude coordinates in degree

    Returns:
        m (): The map.

    inspired by <https://joehamman.com/2013/10/12/plotting-netCDF-data-with-Python/>

    """

    if proj == 'stere':
        lon_0 = lons.mean()
        lat_0 = lats.mean()
        delta_x, delta_y = get_delta_CartCoord(x,y)
        m = Basemap(width = 2*delta_x, height = 2*delta_y,
                    resolution = res, projection = proj,
                    lat_ts = 40, lat_0 =lat_0, lon_0 = lon_0)

    elif proj == 'merc':
        m = Basemap(projection='merc', llcrnrlat = np.amin(lats), urcrnrlat = np.amax(lats),
                    llcrnrlon = np.amin(lons), urcrnrlon = np.amax(lons), lat_ts = 5,
                    resolution = res)

    return m

T0 = ds['t'][0,::]
T_unit = ds['t'].units

# Plotting the temperature without a reference map
fig,ax = plt.subplots(figsize=(15,15))
plt.imshow(T0)   # ,cmap=cm.gray
plt.show()

# Plotting the temperature along with a reference map

# resolution: c (crude), l (low), i (intermediate), h (high), f (full)
proj = 'merc'
res = 'h'
# Initialization of the map
m = initialize_map(proj,res,0,0,lats,lons)

lon2d, lat2d = np.meshgrid(lons, lats)
xi, yi = m(lon2d, lat2d)
fig,ax = plt.subplots(figsize=(15,15)) # define the window dimension


cs = m.pcolor(xi,yi,np.flip(T0,0))


# Add Grid Lines
m.drawparallels(np.arange(-80., 81., 10.), labels=[1,0,0,0], fontsize=10)
m.drawmeridians(np.arange(-180., 181., 10.), labels=[0,0,0,1], fontsize=10)

# Add Coastlines, States, and Country Boundaries
m.drawcoastlines()
m.drawstates()
m.drawcountries()

# Add Colorbar
cbar = m.colorbar(cs, location='bottom', pad="10%")
cbar.set_label(T_unit)

# Add Title
plt.title('CDS 2-meters Temperature US')

# closing the dataset
ds.close()

Creation of the dataset with information for each US county

input_nc_file = 'US_data.nc'

# Importing the shapefile for US
US_shp = gpd.read_file("US/admin_regions.shp")

# Evaluation of the counties centroids
centroids = pd.DataFrame(list(zip(US_shp.index,US_shp.name,US_shp.geometry.centroid.x,US_shp.geometry.centroid.y)))
centroids = centroids.set_index(0)

centroids.index.name = None

centroids = centroids.rename(columns={1:'name',2:'lon',3:'lat'})

centroids

Mask for successive plotting (it only affects plots!)

mask = []
for line in range(US_shp.shape[0]):
    a = (US_shp.iloc[line].geometry.centroid.x > -126)
    b = (US_shp.iloc[line].geometry.centroid.y < 50) #55
    c = (US_shp.iloc[line].geometry.centroid.x < -66)
    d = (US_shp.iloc[line].geometry.centroid.y > 24)  #15
    mask.append(a and b and c and d)
    #mask.append(a)

Create fine mesh (0.01 deg) and interpolate variables

with Dataset(input_nc_file, mode='r') as fh:
    time_var = fh.variables['time']
    times = netCDF4.num2date(time_var[:],time_var.units,
                         only_use_cftime_datetimes=False,
                         only_use_python_datetimes=True)
    lons = fh.variables['longitude'][:]
    lats = np.sort(fh.variables['latitude'][:])

lats_fine = np.arange(np.min(lats), np.max(lats), 0.01) # 0.25 degree fine grid
lons_fine = np.arange(np.min(lons), np.max(lons), 0.01)
lons_sub, lats_sub = np.meshgrid(lons_fine, lats_fine)

# Function to create the mask associated to each single state: it is used to avoid the group by operation

def get_state_mask(lon_list, lat_list, lons_sub, lats_sub):

    lon_idx = []
    lat_idx = []
    lon_app = lon_idx.append
    lat_app = lat_idx.append

    for lon,lat in zip(lon_list, lat_list):
        # get lon index
        a = (np.abs(lons_sub - lon) < 1e-8)
        lon_app([i for i, x in enumerate(a) if x])
        # get lat index
        b = (np.abs(lats_sub - lat) < 1e-8)
        lat_app([i for i, x in enumerate(b) if x ])

    return (np.array(lon_idx),np.array(lat_idx))

Create the list of points for the grouping operation

aa = np.repeat(lats_fine, lons_fine.shape[0])
bb = np.tile(lons_fine,lats_fine.shape[0])

cc = np.array([aa,bb])
points = pd.DataFrame({'lon': bb,'lat': aa})

points

Grouping the atmospherica data and the shapefile using the function sjoin

It takes a little bit of time to run

if os.path.exists('./US_grouped.csv') is False:

    t0 = process_time()

    point_within = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.lon, points.lat)).set_crs("EPSG:4269")

    pointInPolys = sjoin(point_within, US_shp, how = 'inner', op= 'within')

    t1 = process_time()
    print("Time elapsed: ", t1 - t0) # CPU seconds elapsed (floating point)

    #pointInPolys.to_csv('US_grouped.csv')

else:
    print('The grouped file for US already exists! Loading cached file...')
    pointInPolys = pd.read_csv('US_grouped.csv').set_index('Unnamed: 0',drop=True)
    pointInPolys.index.name = None

pointInPolys

Analysing the results of the grouping operation

# Raggruppo secondo il nome dello stato
group_var = 'index_right'  # or name, but there are same names related to different areas!
grouped = pointInPolys.groupby(group_var)

# trasformo grouped in una lista
grouped_list = list(grouped)

print('State name associated to the first element: ',grouped_list[0][0])
print('Dataframe associated to the first element: ',grouped_list[1][1])

len(np.unique(pointInPolys[group_var]))

Getting the mask for each single state of US

if os.path.exists(f'US_mask_by_{group_var}.csv') is False:

    mask_list = []
    append = mask_list.append

    for num,item in enumerate(grouped_list):
        print(f'Processing element {num+1} of {len(grouped_list)}...')
        sub_df = item[1]
        sub_mask = get_state_mask(list(sub_df.lon), list(sub_df.lat), lons_sub[0,:], lats_sub[:,0])
        append((item[0],item[1].name.iloc[0],sub_mask))

    with open(f'US_mask_by_{group_var}.csv','wb') as fp:
        pickle.dump(mask_list,fp)
else:
    print(f'The file US_mask_by_{group_var}.csv already exist!')
    with open(f'US_mask_by_{group_var}.csv', "rb") as fp:   # Unpickling
        mask_list= pickle.load(fp)

mask_list[:10]

Creating the final dataframe using interpolation

This passage is quite slow and can be optimized by using parallel computation...

finer = {}

L = len(times)
T = (np.round(L/100))

if os.path.exists('stats_US.csv') is False:

    with Dataset(input_nc_file, mode='r') as fh:

        variables = list(fh.variables)
        variables.remove('time')
        variables.remove('latitude')
        variables.remove('longitude')
        print(variables)
        first_iter = True
        for variable in variables:
            print('Processing ',variable)
            var = np.flip(fh.variables[variable][:],1)  # flip to consider the sorting applied to the latitude
            ind = 0
            for count,day in enumerate(times):
                #if (count%T == 0):
                    #print(f'{count/L*100}% of the days...')
                # order=1 means bilinear interpolation
                day_formatted = day.strftime("%Y-%m-%d")
                interp_data = bmp.interp(var[count,:,:], lons, lats, lons_sub, lats_sub, order=1)
                for state_mask in mask_list:
                    state_data = np.mean(interp_data[state_mask[2][1],state_mask[2][0]])
                    if first_iter:
                        index = state_mask[0]
                        name = state_mask[1]
                        x_cen = centroids.loc[index]['lon']
                        y_cen = centroids.loc[index]['lat']
                        finer[ind] = ((day_formatted,f'{index}',f'{name}',f'{x_cen}',f'{y_cen}',state_data))
                    else:
                        finer[ind] = finer[ind] + (state_data,)
                    ind += 1
            first_iter = False

else:
    print(f'The file stats_US.csv already exists!')
    df_result = pd.read_csv('stats_US.csv')
    df_result = df_result.set_index(['date','shp_index','state','lon','lat'])

# Post-processing the dataframe before saving. Setting the indices and the measurement units

if os.path.exists('stats_US.csv') is False:
    df_result = pd.DataFrame.from_dict(finer,orient='index')

    units = []
    for var in variables:
        if 'o3' in var:
            units.append(' (kg kg^-1)')
        elif 'relHum' in var:
            units.append( ' (%)')
        elif 'specHum' in var:
            units.append(' (kg kg^-1)')
        elif 'temp' in var:
            units.append(' (K)')
        elif 'wind' in var:
            units.append(' (m s^-1)')

    col_names0 = ['date','shp_index','state','lon','lat']
    col_names1 = [x+y for x,y in zip(variables,units)]
    col_names = col_names0 + col_names1

    new_cols = {num:name for num,name in zip(range(len(col_names)),col_names)}
    df_result = df_result.rename(columns=new_cols)


    df_result = df_result.set_index(col_names0)
    df_result = df_result.reindex(sorted(df_result.columns), axis=1)

    df_result.to_csv('stats_US.csv')

    print(df_result)

else:
    print(f'The file stats_US.csv already exists!')
    print(df_result)