Skip to content

Commit

Permalink
Revise retrieve_databundle to download latest protectedplanet data
Browse files Browse the repository at this point in the history
  • Loading branch information
davide-f committed Dec 9, 2023
1 parent 9dfcd13 commit 6ecd9a9
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 34 deletions.
2 changes: 1 addition & 1 deletion configs/bundle_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -324,5 +324,5 @@ databundles:
category: landcover
destination: "data/landcover/world_protected_areas"
urls:
protectedplanet: https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Oct2023_Public_shp.zip
protectedplanet: https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_{month:s}{year:d}_Public_shp.zip
output: [data/landcover/world_protected_areas/*]
110 changes: 77 additions & 33 deletions scripts/retrieve_databundle_light.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
urls: # list of urls by source, e.g. zenodo or google
zenodo: {zenodo url} # key to download data from zenodo
gdrive: {google url} # key to download data from google drive
protectedplanet: {url} # key to download data from protected planet
protectedplanet: {url} # key to download data from Protected Planet; the url may contain the placeholders {month:s} and {year:d}, which the workflow fills in with the current (or, on retry, a preceding) month and year
direct: {url} # key to download data directly from a url; if unzip option is enabled data are unzipped
post: # key to download data using an url post request; if unzip option is enabled data are unzipped
url: {url}
Expand Down Expand Up @@ -80,6 +80,7 @@
- ``cutouts``: input data unzipped into the cutouts folder
"""
import datetime as dt
import os
import re
from zipfile import ZipFile
Expand Down Expand Up @@ -236,7 +237,7 @@ def download_and_unzip_gdrive(config, rootpath, hot_run=True, disable_progress=F


def download_and_unzip_protectedplanet(
config, rootpath, hot_run=True, disable_progress=False
config, rootpath, attempts=3, hot_run=True, disable_progress=False
):
"""
download_and_unzip_protectedplanet(config, rootpath, dest_path,
Expand All @@ -250,6 +251,9 @@ def download_and_unzip_protectedplanet(
Configuration data for the category to download
rootpath : str
Absolute path of the repository
attempts : int (default 3)
    Number of preceding monthly snapshots to fall back to.
    The download is first attempted for the current month; on failure it is
    retried for up to ``attempts`` preceding months, stepping back one month
    per retry.
hot_run : Bool (default True)
When true the data are downloaded
When false, the workflow is run without downloading and unzipping
Expand All @@ -265,51 +269,91 @@ def download_and_unzip_protectedplanet(

url = config["urls"]["protectedplanet"]

def get_first_day_of_month(date):
    """Return *date* shifted to the first day of its month (time-of-day kept)."""
    # Subtracting (day - 1) days from any date lands on the 1st of the same month.
    return date - dt.timedelta(days=date.day - 1)

def get_first_day_of_previous_month(date):
    """Return the first day of the month containing the day before *date*.

    When *date* is itself the first of a month, this is the first day of
    the previous month (time-of-day kept).
    """
    # Step back one calendar day, then snap to the 1st of that month.
    one_day_earlier = date - dt.timedelta(days=1)
    return one_day_earlier.replace(day=1)

current_first_day = get_first_day_of_month(dt.datetime.today())

if hot_run:
if os.path.exists(file_path):
os.remove(file_path)

try:
logger.info(f"Downloading resource '{resource}' from cloud '{url}'.")
progress_retrieve(url, file_path, disable_progress=disable_progress)

zip_obj = ZipFile(file_path, "r")
downloaded = False

# list of zip files, which contains the shape files
zip_files = [
fname for fname in zip_obj.namelist() if fname.endswith(".zip")
]
for i in range(attempts + 1):
# customize url to current month
month_MMM = current_first_day.strftime("%b")
year_YYYY = current_first_day.year
url_iter = url.format(month=month_MMM, year=year_YYYY)

# extract the nested zip files
for fzip in zip_files:
# final path of the file
try:
inner_zipname = os.path.join(config["destination"], fzip)
resource_iter = resource + " - " + month_MMM + " " + str(year_YYYY)

zip_obj.extract(fzip, path=config["destination"])
try:
logger.info(
f"Downloading resource '{resource_iter}' from cloud '{url}'."
)
progress_retrieve(
url_iter, file_path, disable_progress=disable_progress
)

with ZipFile(inner_zipname, "r") as nested_zip:
nested_zip.extractall(path=config["destination"])
zip_obj = ZipFile(file_path, "r")

# remove inner zip file
os.remove(inner_zipname)
# list of zip files, which contains the shape files
zip_files = [
fname for fname in zip_obj.namelist() if fname.endswith(".zip")
]

logger.info(f"{resource} - Successfully unzipped file '{fzip}'")
except:
logger.warning(
f"Exception while unzipping file '{fzip}' for {resource}: skipped"
# if empty, the download failed
if not zip_files:
raise Exception(
"Corrupted zip file downloaded from protectedplanet"
)

# close and remove outer zip file
zip_obj.close()
os.remove(file_path)
# extract the nested zip files
for fzip in zip_files:
# final path of the file
try:
inner_zipname = os.path.join(config["destination"], fzip)

logger.info(f"Downloaded resource '{resource}' from cloud '{url}'.")
except:
logger.warning(f"Failed download resource '{resource}' from cloud '{url}'.")
return False
zip_obj.extract(fzip, path=config["destination"])

return True
with ZipFile(inner_zipname, "r") as nested_zip:
nested_zip.extractall(path=config["destination"])

# remove inner zip file
os.remove(inner_zipname)

logger.info(f"{resource} - Successfully unzipped file '{fzip}'")
except:
logger.warning(
f"Exception while unzipping file '{fzip}' for {resource_iter}: skipped file"
)

# close and remove outer zip file
zip_obj.close()
os.remove(file_path)

logger.info(
f"Downloaded resource '{resource_iter}' from cloud '{url_iter}'."
)

downloaded = True
break
except:
logger.warning(
f"Failed download resource '{resource_iter}' from cloud '{url_iter}'."
)
current_first_day = get_first_day_of_previous_month(current_first_day)

if not downloaded:
logger.warning(
f"All attempts ({attempts}) to download resource '{resource}' from protected planet failed."
)

return downloaded


def download_and_unpack(
Expand Down

0 comments on commit 6ecd9a9

Please sign in to comment.