Skip to content

Commit

Permalink
Merge branch 'RESTAPI-1194-random-error-on-task-creation' into 'master'
Browse files Browse the repository at this point in the history
Added retry policy on task creation

See merge request firecrest/firecrest!316
  • Loading branch information
Juan Pablo Dorsch committed Aug 27, 2024
2 parents ed3c38d + 0978001 commit e96570f
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Variable `F7T_REALM_RSA_PUBLIC_KEYS` changed to `F7T_AUTH_PUBLIC_KEYS`
- Variable `F7T_REALM_RSA_TYPE_` changed to `F7T_AUTH_ALGORITHMS`
- Added default values on helm charts
- Upgrade `requests` library to version `2.32.0`


### Fixed

- Fix parsing in `GET /utilities/ls` endpoint.
- The job fields `job_data_out` and `job_file_err` from `GET /compute/jobs` will be empty for jobs that are still pending (so that there is no confusion with older output/error files).
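- Added a retry policy to the task creation workflow, so that intermittent errors when contacting the `tasks` microservice no longer fail the request immediately.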

## [1.16.0]

Expand Down
2 changes: 1 addition & 1 deletion deploy/docker/base/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ cryptography==42.0.4
markupsafe==2.1.2
Flask==2.3.2
PyJWT==2.4.0
requests==2.22.0
requests==2.32.0
jaeger_client==4.5.0
Flask-Opentracing==1.1.0
gunicorn==22.0.0
Expand Down
54 changes: 44 additions & 10 deletions src/common/cscs_api_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import time
import threading
import sys
from time import sleep

from typing import Union

Expand Down Expand Up @@ -659,17 +660,50 @@ def create_task(headers, service=None, system=None, init_data=None) -> Union[str
'''

# returns {"task_id":task_id}
# first try to get up task microservice:
try:
# X-Firecrest-Service: service that created the task
headers["X-Firecrest-Service"] = service
headers["X-Machine-Name"] = system
req = requests.post(f"{TASKS_URL}/", data={"init_data": init_data}, headers=headers, verify=(SSL_CRT if SSL_ENABLED else False))
# First, try to create the task.
# The request may fail intermittently with a ChunkedEncodingError
# (server-side error), so it is retried a limited number of times,
# with a fixed delay, before an error is returned to the caller.
retries = 5
delay = 3
for n in range(1, retries+1):
try:
# X-Firecrest-Service: service that created the task
headers["X-Firecrest-Service"] = service
headers["X-Machine-Name"] = system
req = requests.post(f"{TASKS_URL}/", data={"init_data": init_data},
headers=headers,
verify=(SSL_CRT if SSL_ENABLED else False))
break
except requests.exceptions.ChunkedEncodingError as cee:
if retries > n:
logging.warning(f"Task creation warning: '{service}' is not "
"able to reach 'tasks'")
logging.warning(type(cee), exc_info=True)
logging.warning(f"We'll try {retries-n} more times.")
logging.warning(f"Attempting in {delay} seconds...")
sleep(delay)
continue
else:
logging.error(f"Task creation error: '{service}' was not able "
"to reach 'tasks'")
logging.error("All attempts have been exhausted. "
"Sending error message to user")
logging.error(type(cee), exc_info=True)
return -1

except requests.exceptions.ConnectionError as ce:
logging.error(f"Task creation error: '{service}' was not able "
"to reach 'tasks'")
logging.error(type(ce), exc_info=True)
logging.error(ce)
return -1

except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
logging.error(type(e), exc_info=True)
logging.error(e)
return -1
except Exception as e:
logging.error(f"Task creation error")
logging.error(type(e), exc_info=True)
logging.error(e)
return -1

if req.status_code != 201:
return -1
Expand Down

0 comments on commit e96570f

Please sign in to comment.