# US Geographical region
# (North, West, South, East)
geo_limits = [50, -126, 24, -66]
Download of ERA5 data for Covid-19 analysis in US during the first wave
This code manages the download of atmospheric data from the ERA5 reanalysis repository. It also performs some processing on the data, such as grouping it over the US counties and averaging the information for each day.
Downloading atmospheric values at 1000 hPa and 0.25 deg resolution
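The actual cdsapi retrieve cell is not included in this export; the snippet below is a minimal sketch, assuming the ERA5 pressure-level dataset on the CDS and the geo_limits box defined above (the variable list, dates and output file name are illustrative assumptions).
import cdsapi

c = cdsapi.Client()
c.retrieve(
    'reanalysis-era5-pressure-levels',
    {
        'product_type': 'reanalysis',
        'format': 'netcdf',
        'variable': ['temperature', 'relative_humidity',
                     'specific_humidity', 'ozone_mass_mixing_ratio'],
        'pressure_level': '1000',
        'year': '2020',
        'month': ['03', '04', '05'],                  # assumed first-wave months
        'day': [f'{d:02d}' for d in range(1, 32)],
        'time': [f'{h:02d}:00' for h in range(24)],
        'area': geo_limits,                           # [North, West, South, East]
        'grid': [0.25, 0.25],                         # 0.25 deg resolution
    },
    'US_data_raw.nc')                                 # hypothetical output file name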
# Transform the dataset in a Pandas dataframe (just for visual inspection)
dataset_US.to_dataframe()
# Saving the dataset as a NetCDF file in the current working folder
dataset_US.to_netcdf('US_data.nc',format='NETCDF4')
# Plotting the temperature without a reference map
fig,ax = plt.subplots(figsize=(15,15))
plt.imshow(T0) # ,cmap=cm.gray
plt.show()
# closing the dataset
ds.close()
Creation of the dataset with information for each US county
input_nc_file = 'US_data.nc'
# Importing the shapefile for US
US_shp = gpd.read_file("US/admin_regions.shp")
# Computing the centroids of the counties
centroids = pd.DataFrame(list(zip(US_shp.index,US_shp.name,US_shp.geometry.centroid.x,US_shp.geometry.centroid.y)))
centroids = centroids.set_index(0)
centroids.index.name = None
centroids = centroids.rename(columns={1:'name',2:'lon',3:'lat'})
centroids
Mask for subsequent plotting (it only affects plots!)
mask = []
for line in range(US_shp.shape[0]):
a = (US_shp.iloc[line].geometry.centroid.x > -126)
b = (US_shp.iloc[line].geometry.centroid.y < 50) #55
c = (US_shp.iloc[line].geometry.centroid.x < -66)
d = (US_shp.iloc[line].geometry.centroid.y > 24) #15
mask.append(a and b and c and d)
#mask.append(a)
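As a hypothetical illustration (not in the original notebook) of how this mask could be used, the snippet below plots only the counties whose centroid falls inside the bounding box:
fig, ax = plt.subplots(figsize=(15, 15))
US_shp[mask].plot(ax=ax, edgecolor='k', linewidth=0.2)  # boolean indexing with the mask above
ax.set_title('Counties inside the contiguous-US bounding box')
plt.show()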
Create fine mesh (0.01 deg) and interpolate variables
Create the list of points for the grouping operation
aa = np.repeat(lats_fine, lons_fine.shape[0])
bb = np.tile(lons_fine,lats_fine.shape[0])
cc = np.array([aa,bb])
points = pd.DataFrame({'lon': bb,'lat': aa})
points
Grouping the atmospheric data and the shapefile using the function sjoin
It takes a little bit of time to run
Analysing the results of the grouping operation
Getting the mask for each single state of US
mask_list[:10]
Creating the final dataframe using interpolation
This step is quite slow and could be optimized by using parallel computation; a sketch of one possible approach follows.
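A minimal sketch, assuming the same bmp.interp call used further down; interp_one_day and interp_all_days are hypothetical helpers, not part of the original notebook.
from concurrent.futures import ProcessPoolExecutor

import mpl_toolkits.basemap as bmp

def interp_one_day(args):
    # interpolate one daily 2-D field onto the fine grid (order=1 means bilinear, as below)
    field, lons, lats, lons_sub, lats_sub = args
    return bmp.interp(field, lons, lats, lons_sub, lats_sub, order=1)

def interp_all_days(var, lons, lats, lons_sub, lats_sub, workers=4):
    # each day is independent of the others, so the tasks can run in separate processes
    tasks = [(var[i, :, :], lons, lats, lons_sub, lats_sub) for i in range(var.shape[0])]
    with ProcessPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(interp_one_day, tasks))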
# importing packages
import calendar
import datetime
import os.path
import cdsapi
import numpy as np
from netCDF4 import Dataset
import pandas as pd
import xarray as xr
import pdb
import netCDF4
from geopandas.tools import sjoin
from time import process_time
import pickle
# to cope with the Windows bug in basemap - not needed on Linux
# The folder can change on different computers
# os.environ['PROJ_LIB'] = r'C:\\Users\\dario\\Anaconda3\\pkgs\\proj4-5.2.0-h6538335_1006\\Library\\share'
from mpl_toolkits.basemap import Basemap
import mpl_toolkits.basemap as bmp
import matplotlib.pyplot as plt
# To install geopands on Windows, follow the instructions reported on the page:
# <https://www.hatarilabs.com/ih-en/how-to-install-python-geopandas-on-anaconda-in-windows-tutorial>
import geopandas as gpd
# To run this piece of code, we need:
# 1. US_data_025deg_res_hum_and_temp
# 2. US_data_025deg_res
def print_data(*argv):
input_file = argv[0]
with xr.open_dataset(input_file) as nc:
print(nc.variables)
return 0
def add_hum_poll_data(*argv):
input_file = argv[0]
with xr.open_dataset(input_file) as nc:
#print(nc.variables)
# deleting Nan and removing expver
if 'expver' in list(nc.variables):
nc = nc.sel(expver=1).fillna(nc.sel(expver=5))
# resampling the dataset with 1day frequency
nc_1Dresample = nc.resample(time='1D').mean().rename({'t':'temp_mean','o3':'o3_mean','r':'relHum_mean','q':'specHum_mean'}).merge(
nc.resample(time='1D').std().rename({'t':'temp_std','o3':'o3_std','r':'relHum_std','q':'specHum_std'})).merge(
nc.resample(time='1D').max().rename({'t':'temp_max','o3':'o3_max','r':'relHum_max','q':'specHum_max'})).merge(
nc.resample(time='1D').min().rename({'t':'temp_min','o3':'o3_min','r':'relHum_min','q':'specHum_min'})).merge(
nc.resample(time='1D').median().rename({'t':'temp_median','o3':'o3_median','r':'relHum_median','q':'specHum_median'}))
if len(argv) == 2:
nc_1Dresample = argv[1].merge(nc_1Dresample)
return nc_1Dresample
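Hypothetical usage of the helpers above (the original call is not shown in this export); the input file name is an assumption based on the list of required files.
print_data('US_data_025deg_res_hum_and_temp.nc')                     # inspect the available variables
daily_ds = add_hum_poll_data('US_data_025deg_res_hum_and_temp.nc')   # daily mean/std/max/min/median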
def initialize_map(proj,res,x,y,lats,lons):
"""This function initialize the basemap for data visualization.
Args:
proj (str): 'merc' or 'stere'
res (str): resolution: 'c' (crude), 'l' (low), 'i' (intermediate), 'h' (high), 'f' (full)
x (ndarray): x Cartesian coordinates in m
y (ndarray): y Cartesian coordinates in m
lats (ndarray): latitude coordinates in degree
lons (ndarray): longitude coordinates in degree
Returns:
m (): The map.
inspired by <https://joehamman.com/2013/10/12/plotting-netCDF-data-with-Python/>
"""
if proj == 'stere':
lon_0 = lons.mean()
lat_0 = lats.mean()
delta_x, delta_y = get_delta_CartCoord(x,y)
m = Basemap(width = 2*delta_x, height = 2*delta_y,
resolution = res, projection = proj,
lat_ts = 40, lat_0 =lat_0, lon_0 = lon_0)
elif proj == 'merc':
m = Basemap(projection='merc', llcrnrlat = np.amin(lats), urcrnrlat = np.amax(lats),
llcrnrlon = np.amin(lons), urcrnrlon = np.amax(lons), lat_ts = 5,
resolution = res)
return m
# Plotting the temperature along with a reference map
# resolution: c (crude), l (low), i (intermediate), h (high), f (full)
proj = 'merc'
res = 'h'
# Initialization of the map
m = initialize_map(proj,res,0,0,lats,lons)
lon2d, lat2d = np.meshgrid(lons, lats)
xi, yi = m(lon2d, lat2d)
fig,ax = plt.subplots(figsize=(15,15)) # define the window dimension
cs = m.pcolor(xi,yi,np.flip(T0,0))
# Add Grid Lines
m.drawparallels(np.arange(-80., 81., 10.), labels=[1,0,0,0], fontsize=10)
m.drawmeridians(np.arange(-180., 181., 10.), labels=[0,0,0,1], fontsize=10)
# Add Coastlines, States, and Country Boundaries
m.drawcoastlines()
m.drawstates()
m.drawcountries()
# Add Colorbar
cbar = m.colorbar(cs, location='bottom', pad="10%")
cbar.set_label(T_unit)
# Add Title
plt.title('CDS 2-metre Temperature, US')
with Dataset(input_nc_file, mode='r') as fh:
time_var = fh.variables['time']
times = netCDF4.num2date(time_var[:],time_var.units,
only_use_cftime_datetimes=False,
only_use_python_datetimes=True)
lons = fh.variables['longitude'][:]
lats = np.sort(fh.variables['latitude'][:])
lats_fine = np.arange(np.min(lats), np.max(lats), 0.01) # 0.01 degree fine grid
lons_fine = np.arange(np.min(lons), np.max(lons), 0.01)
lons_sub, lats_sub = np.meshgrid(lons_fine, lats_fine)
# Function to create the mask associated with each single state: it is used to avoid repeating the group-by operation
def get_state_mask(lon_list, lat_list, lons_sub, lats_sub):
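    """Return the fine-grid indices of the points belonging to one state.

    lon_list, lat_list are the coordinates of the fine-grid points that fall inside
    the state (taken from the sjoin result); lons_sub, lats_sub are the 1-D fine-grid axes.
    Returns a tuple (lon_idx, lat_idx) of index arrays, so the state values can be picked
    directly from an interpolated field without repeating the group-by operation.
    """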
lon_idx = []
lat_idx = []
lon_app = lon_idx.append
lat_app = lat_idx.append
for lon,lat in zip(lon_list, lat_list):
# get lon index
a = (np.abs(lons_sub - lon) < 1e-8)
lon_app([i for i, x in enumerate(a) if x])
# get lat index
b = (np.abs(lats_sub - lat) < 1e-8)
lat_app([i for i, x in enumerate(b) if x ])
return (np.array(lon_idx),np.array(lat_idx))
if os.path.exists('./US_grouped.csv') is False:
t0 = process_time()
point_within = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.lon, points.lat)).set_crs("EPSG:4269")
pointInPolys = sjoin(point_within, US_shp, how = 'inner', op= 'within')
t1 = process_time()
print("Time elapsed: ", t1 - t0) # CPU seconds elapsed (floating point)
#pointInPolys.to_csv('US_grouped.csv')
else:
print('The grouped file for US already exists! Loading cached file...')
pointInPolys = pd.read_csv('US_grouped.csv').set_index('Unnamed: 0',drop=True)
pointInPolys.index.name = None
pointInPolys
# Grouping according to the state name
group_var = 'index_right' # or 'name', but the same name can refer to different areas!
grouped = pointInPolys.groupby(group_var)
# converting grouped into a list
grouped_list = list(grouped)
print('State name associated with the first element: ',grouped_list[0][0])
print('Dataframe associated with the first element: ',grouped_list[0][1])
len(np.unique(pointInPolys[group_var]))
if os.path.exists(f'US_mask_by_{group_var}.csv') is False:
mask_list = []
append = mask_list.append
for num,item in enumerate(grouped_list):
print(f'Processing element {num+1} of {len(grouped_list)}...')
sub_df = item[1]
sub_mask = get_state_mask(list(sub_df.lon), list(sub_df.lat), lons_sub[0,:], lats_sub[:,0])
append((item[0],item[1].name.iloc[0],sub_mask))
with open(f'US_mask_by_{group_var}.csv','wb') as fp:
pickle.dump(mask_list,fp)
else:
print(f'The file US_mask_by_{group_var}.csv already exists!')
with open(f'US_mask_by_{group_var}.csv', "rb") as fp: # Unpickling
mask_list= pickle.load(fp)
finer = {}
L = len(times)
T = (np.round(L/100))
if os.path.exists('stats_US.csv') is False:
with Dataset(input_nc_file, mode='r') as fh:
variables = list(fh.variables)
variables.remove('time')
variables.remove('latitude')
variables.remove('longitude')
print(variables)
first_iter = True
for variable in variables:
print('Processing ',variable)
var = np.flip(fh.variables[variable][:],1) # flip to consider the sorting applied to the latitude
ind = 0
for count,day in enumerate(times):
#if (count%T == 0):
#print(f'{count/L*100}% of the days...')
# order=1 means bilinear interpolation
day_formatted = day.strftime("%Y-%m-%d")
interp_data = bmp.interp(var[count,:,:], lons, lats, lons_sub, lats_sub, order=1)
for state_mask in mask_list:
state_data = np.mean(interp_data[state_mask[2][1],state_mask[2][0]])
if first_iter:
index = state_mask[0]
name = state_mask[1]
x_cen = centroids.loc[index]['lon']
y_cen = centroids.loc[index]['lat']
finer[ind] = ((day_formatted,f'{index}',f'{name}',f'{x_cen}',f'{y_cen}',state_data))
else:
finer[ind] = finer[ind] + (state_data,)
ind += 1
first_iter = False
else:
print(f'The file stats_US.csv already exists!')
df_result = pd.read_csv('stats_US.csv')
df_result = df_result.set_index(['date','shp_index','state','lon','lat'])
# Post-processing the dataframe before saving. Setting the indices and the measurement units
if os.path.exists('stats_US.csv') is False:
df_result = pd.DataFrame.from_dict(finer,orient='index')
units = []
for var in variables:
if 'o3' in var:
units.append(' (kg kg^-1)')
elif 'relHum' in var:
units.append( ' (%)')
elif 'specHum' in var:
units.append(' (kg kg^-1)')
elif 'temp' in var:
units.append(' (K)')
elif 'wind' in var:
units.append(' (m s^-1)')
col_names0 = ['date','shp_index','state','lon','lat']
col_names1 = [x+y for x,y in zip(variables,units)]
col_names = col_names0 + col_names1
new_cols = {num:name for num,name in zip(range(len(col_names)),col_names)}
df_result = df_result.rename(columns=new_cols)
df_result = df_result.set_index(col_names0)
df_result = df_result.reindex(sorted(df_result.columns), axis=1)
df_result.to_csv('stats_US.csv')
print(df_result)
else:
print(f'The file stats_US.csv already exists!')
print(df_result)
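As a hypothetical follow-up (not part of the original notebook), the final dataframe can be queried through its ('date','shp_index','state','lon','lat') multi-index; the date below is an assumed example value.
day_slice = df_result.xs('2020-03-15', level='date')  # all counties for one (assumed) day
print(day_slice.head())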