diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index e5f3d1104a..af1952892f 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -171,14 +171,10 @@ def test_sdk_e2e(job_namespace): GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", ) -def test_sdk_e2e_managed_by_multikueue(job_namespace): +def test_sdk_e2e_managed_by(job_namespace): JOB_NAME = "pytorchjob-e2e" container = generate_container() - #1. Job created with default value - normal procedure - #2. Job created with external valid value - created but not updated at all - #3. Job created with external invalid value - not created - master = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="OnFailure", @@ -201,17 +197,33 @@ def test_sdk_e2e_managed_by_multikueue(job_namespace): ), ) - managed_by = 'kueue.x-k8s.io/multikueue' - pytorchjob = generate_pytorchjob(job_namespace, JOB_NAME, master, worker, managed_by=managed_by) - - TRAINING_CLIENT.create_job(job=pytorchjob, namespace=job_namespace) + #1. Job created with default value: 'kubeflow.org/training-operator' - job created and status updated + #2. Job created with kueue value: 'kueue.x-k8s.io/multikueue' - job created but status not updated + #3. Job created with invalid value (not acceptable by the webhook) - job not created + controllers = { + JOB_NAME+"-default-controller": 'kubeflow.org/training-operator', + JOB_NAME+"-multikueue-controller": 'kueue.x-k8s.io/multikueue', + JOB_NAME+"-invalid-controller": 'kueue.x-k8s.io/other-controller', + } + for job_name, managed_by in controllers.items(): + pytorchjob = generate_pytorchjob(job_namespace, job_name, master, worker, managed_by=managed_by) + TRAINING_CLIENT.create_job(job=pytorchjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + jobs = TRAINING_CLIENT.list_jobs(job_namespace) + logging.info(jobs) try: - conditions = TRAINING_CLIENT.get_job_conditions(JOB_NAME, job_namespace, TRAINING_CLIENT.job_kind, pytorchjob) - if len(conditions) != 0: - raise Exception(f"PyTorchJob conditions {conditions} should not be updated, externally managed by {managed_by}") + #Only jobs with valid controllers should be created, 2 out of 3 satisfy this condition: 'kubeflow.org/training-operator' and 'kueue.x-k8s.io/multikueue' + if len(jobs) != 2: + raise Exception(f"Too many PyTorchJobs created {jobs}") + + for job in jobs: + if job._metadata.name == 'kubeflow.org/training-operator': + utils.verify_job_e2e(TRAINING_CLIENT, job._metadata.name, job_namespace, wait_timeout=900) + + if job._metadata.name == 'kueue.x-k8s.io/multikueue': + utils.verify_externally_managed_job_e2e(TRAINING_CLIENT, job._metadata.name, job_namespace) except Exception as e: utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) diff --git a/sdk/python/test/e2e/utils.py b/sdk/python/test/e2e/utils.py index 7a6f81f922..82185f1eb8 100644 --- a/sdk/python/test/e2e/utils.py +++ b/sdk/python/test/e2e/utils.py @@ -11,6 +11,15 @@ logging.getLogger().setLevel(logging.INFO) +def verify_externally_managed_job_e2e(client: TrainingClient, name: str, namespace: str): + """Verify externally managed Training Job e2e test.""" + logging.info(f"\n\n\n{client.job_kind} is created") + job = client.get_job(name, namespace) + conditions = client.get_job_conditions(name, namespace, client.job_kind, job) + if len(conditions) != 0: + raise Exception(f"{client.job_kind} conditions {conditions} should not be updated, externally managed by {managed_by}") + + def verify_unschedulable_job_e2e(client: TrainingClient, name: str, namespace: str): """Verify unschedulable Training Job e2e test.""" logging.info(f"\n\n\n{client.job_kind} is creating")