This repository has been archived by the owner on Jul 25, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
ReadingData.py
191 lines (143 loc) · 6.3 KB
/
ReadingData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# -*- coding: utf-8 -*-
"""
Created on Wed May 19 10:41:14 2021
author: Jasper Dijkstra
This script contains functions that read input datasets and return as pd.df or np.array
For example, data specific:
- Population Density: https://ec.europa.eu/eurostat/web/products-datasets/-/demo_r_d3dens
Or general function:
- ReadNC: Reads an input NetCDF file and returns lat, lon and data.
"""
import numpy as np
import pandas as pd
import netCDF4 as nc
def ReadPopulationDensityTSV(tsv_path, zones_df, area_df, geo_name = "NUTS_ID", years = None):
    """
    Read the Eurostat population density TSV and return, per region, the
    population density and total population for each requested year.

    Parameters
    ----------
    tsv_path : str
        Path to the tsv file, which can be found via:
        https://ec.europa.eu/eurostat/web/products-datasets/-/demo_r_d3dens
    zones_df : pandas.DataFrame
        DataFrame containing the zones to which the data has to be filtered.
    area_df : pandas.DataFrame
        DataFrame containing the area (in km2) of all regions to which has to be filtered.
        The last column is assumed to hold the area values.
    geo_name : str, optional
        Name of the column containing the geolocation. The default is "NUTS_ID".
    years : int, list, optional
        The year(s) for which data has to be returned. This can be either an integer or a
        list with integers. The default (None) means 2001-2018, since this is the time
        range in which the EFD contains data.

    Returns
    -------
    df : pandas.DataFrame
        DataFrame containing population density ("Dens<year>") and total population
        ("Total<year>") for each specified year, plus their means over all years
        ("MeanDens", "MeanTotal").

    Raises
    ------
    TypeError
        If 'years' is neither an int nor a list.
    """
    # Default inside the body instead of a mutable default argument
    # (avoids the shared-mutable-default pitfall).
    if years is None:
        years = list(range(2001, 2019))
    # Accept plain ints as well as numpy integer scalars (e.g. from np.arange).
    if isinstance(years, (int, np.integer)):
        years = [years]
    elif not isinstance(years, list):
        # Raise instead of assert: asserts are stripped under "python -O".
        raise TypeError("The 'years' parameter should be a list or int, not {}.".format(type(years)))
    # Year columns in the Eurostat TSV carry a trailing space (e.g. "2018 ").
    years = [f"{year} " for year in years]
    # Open the tsv file
    df = pd.read_csv(tsv_path, delimiter="\t")
    # The first column packs several comma-separated id's (e.g. "unit,geo\time");
    # split it into separate columns and identify the one holding the geo id's.
    headers = df.iloc[:,0].name.split(',')
    left_cols = pd.DataFrame()
    left_cols[headers] = df.iloc[:,0].str.split(",", expand = True)
    geo_col = [col for col in left_cols if col.startswith('geo')]
    # Now append geo id's to df
    df = df.drop(df.columns[0], axis = 1) # Drop original packed column
    df = pd.merge(left_cols[geo_col], df, left_index = True, right_index=True)
    df.rename(columns={geo_col[0]:geo_name}, inplace = True, errors = 'raise')
    # Filter to the required NUTS regions
    regions = list(zones_df[geo_name]) # List required NUTS regions
    df = df[df[geo_name].isin(regions)] # Remove those that do not occur in the list
    # Filter to the required years (the first two columns are kept as-is)
    df = pd.concat([df[df.columns[:2]], df[years]], axis = 1)
    # Add a NUTS area component to the df; area values are assumed to sit
    # in the last column of area_df
    df = pd.merge(df, area_df, on=[geo_name])
    area_name = list(area_df.columns)[-1]
    # Apply the following steps for each year column in the df:
    for year in years:
        # Remove all non-number characters (except "."), e.g. Eurostat data flags
        df[year] = df[year].str.replace(r"[^.0-9\s]", "", regex = True)
        # Convert all str population density values to a number
        # (vectorized pd.to_numeric instead of the slower per-element .apply)
        df[year] = pd.to_numeric(df[year], errors = 'coerce')
        # Total population = density (per km2) * area (km2)
        df.insert(loc = df.columns.get_loc(year)+1, column = f"Total{year}", value = df[year]*df[area_name])
        # Rename Year Column, for more clarity
        df.rename(columns = {year : f"Dens{year}"}, inplace = True)
    # Now get the mean over all years:
    denscols = [col for col in df if col.startswith('Dens')]
    df["MeanDens"] = df[denscols].mean(axis = 1)
    totalcols = [col for col in df if col.startswith('Total')]
    df["MeanTotal"] = df[totalcols].mean(axis = 1)
    return df
# def ReadMODIS_MCD64A1(csv_path):
# """
# Parameters
# ----------
# csv_path : str
# Path to .csv file that results from the Google Earth Engine Script:
# https://code.earthengine.google.com/84d4275b0e18ab06912db61f430259ac
# Returns
# -------
# pd.DataFrame
# DataFrame with total burned area per year and the mean, stdev and ba_cv
# of the annual burned area per NUTS3 geometry in the European Fire Emissions Database
# """
# # Import the csv file as a pd.DataFrame
# df = pd.read_csv(csv_path, delimiter = ',')
# # Remove unneccesary columns
# df.drop(df.columns[len(df.columns)-1], axis=1, inplace=True)
# del df['system:index']
# # Get the area from m2 to km2
# df['sum'] = df['sum'] * 1e-06
# # Create new df with all unique NUTS regions
# ba_df = df['NUTS_ID'].drop_duplicates()
# # Now filter all annual data per column
# for year in range(int(df['year'].min()), int(df['year'].max())+1):
# year_df = df.loc[(df["year"] == year)]
# year_df = year_df[["NUTS_ID", "sum"]]
# ba_df = pd.merge(ba_df, year_df, on=['NUTS_ID'])
# ba_df = ba_df.rename(columns={'sum': str(year)})
# # Calculate the coefficient of variation
# ba_df['mean'] = ba_df.mean(axis = 1)
# ba_df['stdev'] = ba_df.std(axis = 1)
# ba_df['BA_CV'] = np.divide(ba_df['stdev'], ba_df['mean'])#,
# #out = np.zeros(ba_df['stdev'].shape, dtype=float), where = ba_df['mean'] != 0)
# return ba_df
def ReadNC(path, variable):
    """
    General function to read NetCDF file

    Parameters
    ----------
    path : str
        Path to the nc file. The file is assumed to contain 'Longitude' and
        'Latitude' variables alongside the requested data variable.
    variable : str
        The name of the variable in the file that contains the data values of interest.

    Returns
    -------
    lon : np.array
        1D Array with all longitude coordinates of the grid cells.
    lat : np.array
        1D Array with all latitude coordinates of the grid cells.
    data : np.array
        2D array of size [lat, lon] that describes the value of the grid cells,
        flipped upside down so the data is the correct way up.
    """
    # Use a context manager so the file is closed even when a read fails
    # halfway (the previous explicit close() was skipped on an exception,
    # e.g. a KeyError for a missing variable name).
    with nc.Dataset(path) as fid:
        # Get Coordinates (masked values are filled with NaN)
        lon = np.ma.filled(fid.variables['Longitude'][:], np.nan)
        lat = np.ma.filled(fid.variables['Latitude'][:], np.nan)
        # Get Dataset of interest
        data = np.ma.filled(fid.variables[variable][:], np.nan)
    # Make the data the correct way up
    # NOTE(review): lat itself is NOT flipped to match the flipped rows —
    # confirm callers expect this asymmetry.
    data = np.flipud(data)
    return lon, lat, data