Merge pull request #19192 from edx/youngstrom/fix-rate-and-max-errors
Fix ratelimiting issue and account for other ecs failures
This commit is contained in:
@@ -3,6 +3,7 @@ import logging
|
||||
import time
|
||||
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -13,9 +14,16 @@ class PytestContainerManager():
|
||||
"""
|
||||
Responsible for spinning up and terminating ECS tasks to be used with pytest-xdist
|
||||
"""
|
||||
TASK_RUN_TIMEOUT_MINUTES = 10
|
||||
MAX_RUN_TASK_RETRIES = 7
|
||||
|
||||
def __init__(self, region, cluster):
|
||||
self.ecs = boto3.client('ecs', region)
|
||||
config = Config(
|
||||
retries={
|
||||
'max_attempts': self.MAX_RUN_TASK_RETRIES
|
||||
}
|
||||
)
|
||||
self.ecs = boto3.client('ecs', region, config=config)
|
||||
self.cluster_name = cluster
|
||||
|
||||
def spin_up_tasks(self, number_of_tasks, task_name, subnets, security_groups, public_ip, launch_type):
|
||||
@@ -23,9 +31,6 @@ class PytestContainerManager():
|
||||
Spins up tasks and generates two .txt files, containing the IP/ arns
|
||||
of the new tasks.
|
||||
"""
|
||||
TASK_RUN_TIMEOUT_MINUTES = 10
|
||||
MAX_RUN_TASK_RETRIES = 7
|
||||
|
||||
revision = self.ecs.describe_task_definition(taskDefinition=task_name)['taskDefinition']['revision']
|
||||
task_definition = "{}:{}".format(task_name, revision)
|
||||
|
||||
@@ -41,7 +46,7 @@ class PytestContainerManager():
|
||||
# Spin up tasks. boto3's run_task only allows 10 tasks to be launched at a time
|
||||
task_arns = []
|
||||
for num in task_num_list:
|
||||
for retry in range(1, MAX_RUN_TASK_RETRIES + 1):
|
||||
for retry in range(1, self.MAX_RUN_TASK_RETRIES + 1):
|
||||
try:
|
||||
response = self.ecs.run_task(
|
||||
count=num,
|
||||
@@ -58,9 +63,9 @@ class PytestContainerManager():
|
||||
)
|
||||
except ClientError as err:
|
||||
# Handle AWS throttling with an exponential backoff
|
||||
if retry == MAX_RUN_TASK_RETRIES:
|
||||
if retry == self.MAX_RUN_TASK_RETRIES:
|
||||
raise StandardError(
|
||||
"MAX_RUN_TASK_RETRIES ({}) reached while spinning up tasks due to AWS throttling.".format(MAX_RUN_TASK_RETRIES)
|
||||
"MAX_RUN_TASK_RETRIES ({}) reached while spinning up tasks due to AWS throttling.".format(self.MAX_RUN_TASK_RETRIES)
|
||||
)
|
||||
logger.info("Hit error: {}. Retrying".format(err))
|
||||
countdown = 2 ** retry
|
||||
@@ -72,11 +77,17 @@ class PytestContainerManager():
|
||||
for task_response in response['tasks']:
|
||||
task_arns.append(task_response['taskArn'])
|
||||
|
||||
failure_array = response['failures']
|
||||
if failure_array:
|
||||
raise StandardError(
|
||||
"There was at least one failure when spinning up tasks: {}".format(failure_array)
|
||||
)
|
||||
|
||||
# Wait for tasks to finish spinning up
|
||||
not_running = task_arns[:]
|
||||
ip_addresses = []
|
||||
all_running = False
|
||||
for attempt in range(0, TASK_RUN_TIMEOUT_MINUTES * 2):
|
||||
for attempt in range(0, self.TASK_RUN_TIMEOUT_MINUTES * 2):
|
||||
time.sleep(30)
|
||||
list_tasks_response = self.ecs.describe_tasks(cluster=self.cluster_name, tasks=not_running)['tasks']
|
||||
del not_running[:]
|
||||
|
||||
Reference in New Issue
Block a user