Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding mmpose #17

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9cb27ea
support for mmpose on asg
antoinefalisse Jun 5, 2024
7d0f3e5
typo
antoinefalisse Jun 6, 2024
1f2445c
ecs-on-ec2-auto-scaling-updates
antoinefalisse Jun 6, 2024
0859046
fix: fix ECS_CONTAINER_METADATA_FILE is not set
sashasimkin Jun 7, 2024
e802650
fix: set ECS_RESERVED_MEMORY to 768 to make placement more predictable
sashasimkin Jun 7, 2024
09b146e
fix: Scale by one instance, not 2
sashasimkin Jun 7, 2024
92eb747
fix conflicts
antoinefalisse Jun 7, 2024
056f12a
fix conflicts
antoinefalisse Jun 7, 2024
2b64aa5
Merge branch 'feature/ecs-on-ec2-auto-scaling' into feature/ecs-on-ec…
antoinefalisse Jun 7, 2024
ffc0ff0
bring back some changes
antoinefalisse Jun 7, 2024
ad3b532
fix: Adjust memory reservation again
sashasimkin Jun 7, 2024
cdb9761
Merge branch 'feature/ecs-on-ec2-auto-scaling' into feature/ecs-on-ec…
antoinefalisse Jun 7, 2024
bf7d8a2
Merge branch 'feature/ecs-on-ec2-auto-scaling' into feature/ecs-on-ec…
antoinefalisse Jun 11, 2024
736be49
minor
antoinefalisse Jun 11, 2024
59ed567
minor
antoinefalisse Jun 11, 2024
770b7cc
only one GPU
antoinefalisse Jun 11, 2024
7c81950
feat: Implement GPU sharing suggested by GH users
sashasimkin Jun 13, 2024
72c5492
fix: Remove old launch configuration code
sashasimkin Jun 13, 2024
605c11e
wip
antoinefalisse Jun 18, 2024
6c2d55a
fix: Update nvidia GPU sharing
sashasimkin Jun 19, 2024
ba11800
typo
antoinefalisse Jun 21, 2024
0b3697b
fix: Resort to having only mmpose with GPU available
sashasimkin Jun 23, 2024
7f8ef96
fix: Persist configuration that has mmpose and openpose working together
sashasimkin Jun 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions dev/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ module "processing" {
region = "us-west-2"
num_machines = 0
opencap_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/opencap-dev"
openpose_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/openpose"
openpose_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/openpose-dev"
opencap_api_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/api-dev"
mmpose_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/mmpose"
mmpose_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/mmpose-dev"
opencap_analysis_max_centerofmass_vpos_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap-analysis/max_centerofmass_vpos-dev"
opencap_gait_analysis_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap-analysis/gait_analysis-dev"
opencap_treadmill_gait_analysis_ecr_repository = "660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap-analysis/treadmill_gait_analysis-dev"
Expand All @@ -21,15 +21,13 @@ module "processing" {

processing_asg_scaling_config = {
min_size = 0
max_size = 2
max_size = 0
desired_size = 0
}

processing_asg_scaling_target = 5
processing_asg_trials_baseline = 3

processing_asg_use_launch_config = false

# processing_asg_instance_type = "g5.2xlarge"
# processing_ecs_task_memory = 30146
processing_asg_instance_type = "g5.xlarge"
Expand Down
41 changes: 3 additions & 38 deletions modules/processing/autoscaling.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,14 @@ data "aws_ami" "latest_ecs" {
locals {
# Boot-time shell script (user data) that appends ECS agent settings to
# /etc/ecs/ecs.config: the cluster to join (taken from aws_ecs_cluster.ecs_cluster),
# container metadata enabled, and ECS_RESERVED_MEMORY set to 256
# (MiB per the ECS agent documentation).
# NOTE(review): presumably consumed by the worker launch template; the usage is
# not visible in this hunk, confirm. The "#" lines inside the heredoc are part
# of the script itself, not HCL comments.
lt_user_data_raw = <<-EOF
#!/bin/bash
# ECS cluster settings
echo 'ECS_CLUSTER=${aws_ecs_cluster.ecs_cluster.name}' >> /etc/ecs/ecs.config
echo 'ECS_ENABLE_CONTAINER_METADATA=true' >> /etc/ecs/ecs.config
echo 'ECS_RESERVED_MEMORY=256' >> /etc/ecs/ecs.config
EOF
}

resource "aws_launch_template" "ecs_worker_launch_template" {
count = !var.processing_asg_use_launch_config ? 1 : 0

name_prefix = "${var.app_name}-processing-worker${var.env}"
image_id = data.aws_ami.latest_ecs.image_id
iam_instance_profile {
Expand Down Expand Up @@ -65,12 +64,10 @@ resource "aws_launch_template" "ecs_worker_launch_template" {
}

resource "aws_autoscaling_group" "worker_lt_asg" {
count = !var.processing_asg_use_launch_config ? 1 : 0

name = "${var.app_name}-processing-worker-asg${var.env}"
vpc_zone_identifier = [values(aws_subnet.pub_subnet)[0].id]
launch_template {
id = aws_launch_template.ecs_worker_launch_template[0].id
id = aws_launch_template.ecs_worker_launch_template.id
version = "$Latest"
}

Expand Down Expand Up @@ -103,7 +100,7 @@ resource "aws_ecs_capacity_provider" "worker_lt_gpu_provider" {
name = "${var.app_name}-processing-worker-gpu-capacity${var.env}"

auto_scaling_group_provider {
auto_scaling_group_arn = var.processing_asg_use_launch_config ? aws_autoscaling_group.opencap_processing_asg[0].arn : aws_autoscaling_group.worker_lt_asg[0].arn
auto_scaling_group_arn = aws_autoscaling_group.worker_lt_asg.arn
managed_termination_protection = "DISABLED"

managed_scaling {
Expand Down Expand Up @@ -196,35 +193,3 @@ resource "aws_appautoscaling_policy" "target_tracking" {
}
}
}

### Launch Configuration | Deprecated ###
resource "aws_launch_configuration" "ecs_launch_config" {
# Deprecated launch-configuration path: instantiated only when
# var.processing_asg_use_launch_config is true, otherwise count is 0.
count = var.processing_asg_use_launch_config ? 1 : 0

name = "${var.app_name}-processing-workers${var.env}"
image_id = data.aws_ami.latest_ecs.image_id
iam_instance_profile = aws_iam_instance_profile.ecs_agent.name
security_groups = [aws_security_group.ecs_sg.id]
# Boot script appends ECS_CLUSTER to the ECS agent config.
# NOTE(review): the cluster name is rebuilt from var.app_name here instead of
# being read from the aws_ecs_cluster resource; confirm the two always match.
user_data = "#!/bin/bash\necho ECS_CLUSTER=${var.app_name}-processing-cluster >> /etc/ecs/ecs.config"
instance_type = var.processing_asg_instance_type
# NOTE(review): uses the "debug" key pair, presumably for SSH access; confirm.
key_name = aws_key_pair.debug.key_name
associate_public_ip_address = true

root_block_device {
# Root volume size (GiB per the AWS provider documentation).
volume_size = 128
}
}

resource "aws_autoscaling_group" "opencap_processing_asg" {
# Autoscaling group backed by the deprecated launch configuration; created
# only when var.processing_asg_use_launch_config is true.
count = var.processing_asg_use_launch_config ? 1 : 0

name = "asg${var.env}"
# Instances are placed in the first public subnet only.
vpc_zone_identifier = [values(aws_subnet.pub_subnet)[0].id]
launch_configuration = aws_launch_configuration.ecs_launch_config[0].name

# Fixed-size group: desired capacity and maximum both come from var.num_machines,
# with a minimum of 0.
desired_capacity = var.num_machines
min_size = 0
max_size = var.num_machines
# EC2 status checks, with a grace period of 300 (seconds per the AWS provider
# documentation) before health checks apply to a new instance.
health_check_grace_period = 300
health_check_type = "EC2"
}
7 changes: 6 additions & 1 deletion modules/processing/ecs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,17 @@ resource "aws_cloudwatch_log_group" "openpose-logs" {
retention_in_days = 90
}

resource "aws_cloudwatch_log_group" "mmpose-logs" {
# CloudWatch log group for the mmpose container. The name follows the same
# "/ecs/<app_name>-mmpose<env>" pattern as the "awslogs-group" option in the
# task definition template.
name = "/ecs/${var.app_name}-mmpose${var.env}"
# Log events are kept for 90 days.
retention_in_days = 90
}

resource "aws_ecs_task_definition" "task_definition" {
family = "worker${var.env}"
container_definitions = data.template_file.task_definition_template.rendered
execution_role_arn = aws_iam_role.ecs_tasks_execution_role.arn
task_role_arn = aws_iam_role.processing_worker_role.arn

memory = var.processing_ecs_task_memory
volume {
name = "data${var.env}"
Expand Down
81 changes: 70 additions & 11 deletions modules/processing/task_definition.json.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@
{
"name": "DOCKERCOMPOSE",
"value": "1"
},
{
"name": "ECS_ENABLE_CONTAINER_METADATA",
"value": "true"
}
],
"secrets": [
Expand All @@ -36,7 +32,12 @@
"valueFrom": "${API_URL}"
}
],
"resourceRequirements": null,
"resourceRequirements" : [
{
"type" : "GPU",
"value" : "1"
}
],
"ulimits": null,
"dnsServers": [],
"mountPoints": [
Expand Down Expand Up @@ -89,12 +90,11 @@
"command": [],
"linuxParameters": null,
"cpu": 0,
"environment": [],
"resourceRequirements" : [
{
"type" : "GPU",
"value" : "1"
}
"environment": [
{
"name": "NVIDIA_VISIBLE_DEVICES",
"value": "0"
}
],
"ulimits": null,
"dnsServers": [],
Expand Down Expand Up @@ -130,5 +130,64 @@
"systemControls": [],
"privileged": null,
"name": "openpose"
},
{
"dnsSearchDomains": [],
"environmentFiles": null,
"logConfiguration": {
"logDriver": "awslogs",
"secretOptions": null,
"options": {
"awslogs-group": "/ecs/${APP_NAME}-mmpose${ENV}",
"awslogs-region": "${REGION}",
"awslogs-stream-prefix": "ecs"
}
},
"entryPoint": [],
"portMappings": [
],
"command": [],
"linuxParameters": null,
"cpu": 0,
"environment": [
{
"name": "NVIDIA_VISIBLE_DEVICES",
"value": "0"
}
],
"ulimits": null,
"dnsServers": [],
"mountPoints": [
{
"readOnly": false,
"containerPath": "/mmpose/data",
"sourceVolume": "data${ENV}"
}
],
"workingDirectory": null,
"secrets": null,
"dockerSecurityOptions": [],
"memory": null,
"memoryReservation": null,
"volumesFrom": [],
"stopTimeout": null,
"image": "${MMPOSE}",
"startTimeout": null,
"firelensConfiguration": null,
"dependsOn": null,
"disableNetworking": null,
"interactive": null,
"healthCheck": null,
"essential": true,
"links": [],
"hostname": null,
"extraHosts": null,
"pseudoTerminal": null,
"user": null,
"readonlyRootFilesystem": null,
"dockerLabels": {},
"systemControls": [],
"privileged": null,
"name": "mmpose"
}
]
1 change: 1 addition & 0 deletions modules/processing/template_file.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ data "template_file" "task_definition_template" {
ENV = "${var.env}"
APP_NAME = "${var.app_name}"
OPENPOSE = "${var.openpose_ecr_repository}"
MMPOSE = "${var.mmpose_ecr_repository}"
OPENCAP = "${var.opencap_ecr_repository}"
API_TOKEN = var.env == "-dev" ? "arn:aws:secretsmanager:us-west-2:660440363484:secret:OpenCapProcessingCredentials-oXYoTR:api_token_dev::" : "arn:aws:secretsmanager:us-west-2:660440363484:secret:OpenCapProcessingCredentials-oXYoTR:api_token::"
API_URL = var.env == "-dev" ? "arn:aws:secretsmanager:us-west-2:660440363484:secret:OpenCapProcessingCredentials-oXYoTR:api_url_dev::" : "arn:aws:secretsmanager:us-west-2:660440363484:secret:OpenCapProcessingCredentials-oXYoTR:api_url::"
Expand Down
9 changes: 0 additions & 9 deletions modules/processing/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,6 @@ variable "num_machines" {
default = 1
}

variable "processing_asg_use_launch_config" {
# Migration flag: true selects the launch configuration for the processing
# autoscaling group, false selects the launch template. Defaults to true.
default = true
description =<<-EOF
true: Use launch config for processing autoscaling group, false: use launch template
This is for providing a smooth transition from launch config to launch template as LCs are deprecated
the goal is to delete this variable along with LC code when all services are using launch templates
EOF
}

variable "processing_asg_scaling_config" {
default = {
min_size = 0
Expand Down