Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Erikk/ban 375 inference time to potassium #33

Merged
merged 7 commits into from
Sep 16, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions potassium/potassium.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
from flask import Flask, request, make_response, abort
from werkzeug.serving import make_server
from threading import Thread, Lock, Condition
Expand Down Expand Up @@ -47,6 +48,8 @@ def __init__(self, name):
self._gpu_lock = Lock()
self._background_task_cv = Condition()
self._sequence_number = 0
self._idle_start_time = 0
self._inference_start_time = 0
self._flask_app = self._create_flask_app()

#
Expand Down Expand Up @@ -145,6 +148,7 @@ def _handle_generic(self, endpoint, flask_request):
return res

res = None
self._inference_start_time = time.time()

if endpoint.type == "handler":
req = Request(
Expand All @@ -162,6 +166,7 @@ def _handle_generic(self, endpoint, flask_request):
res = make_response(tb_str)
res.status_code = 500
res.headers['X-Endpoint-Type'] = endpoint.type
self._idle_start_time = time.time()
self._gpu_lock.release()
elif endpoint.type == "background":
req = Request(
Expand All @@ -178,7 +183,8 @@ def task(endpoint, lock, req):
finally:
with self._background_task_cv:
self._background_task_cv.notify_all()
# release lock

self._idle_start_time = time.time()
lock.release()

thread = Thread(target=task, args=(endpoint, self._gpu_lock, req))
Expand Down Expand Up @@ -219,14 +225,22 @@ def handle(path):

@flask_app.route('/__status__', methods=["GET"])
def status():
idle_time = 0
inference_time = int((time.time() - self._inference_start_time)*1000)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't inference_time be 0 if there's no inference currently running? Right now, if I'm reading this correctly, it will pretty much always be > 0

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth noting that there's a minor race condition to watch out for when implementing this as well since there will be a period where either inference start time hasn't been updated or the lock hasn't been acquired yet

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question — it depends on whether we want to count the cold boot as part of the inference time. Here I opted to include it, with the reasoning that if someone (for some weird reason) has a cold boot which is very long or is stuck in an infinite loop, the timeout would still kick in and kill the replica. So from a user's point of view: "I set this value in the UI to 15min" => "it is not possible to have a replica running longer than 15min, and thus I won't get billed for more than 15min".

Does this sound reasonable?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

offline discussion: let's move this to be only around the inference itself and not the cold boot.

gpu_available = not self._gpu_lock.locked()

if gpu_available:
idle_time = int((time.time() - self._idle_start_time)*1000)

res = make_response({
"gpu_available": not self._gpu_lock.locked(),
"sequence_number": self._sequence_number
"gpu_available": gpu_available,
"sequence_number": self._sequence_number,
"idle_time": idle_time,
"inference_time": inference_time,
})

res.status_code = 200
res.headers['X-Endpoint-Type'] = "status"
res
return res

return flask_app
Expand All @@ -235,6 +249,8 @@ def status():
def serve(self, host="0.0.0.0", port=8000):
print(colored("------\nStarting Potassium Server 🍌", 'yellow'))
self._init_func()
server = make_server(host, port, self._flask_app)
server = make_server(host, port, self._flask_app, threaded=True)
print(colored(f"Serving at http://{host}:{port}\n------", 'green'))
self._idle_start_time = time.time()
self._inference_start_time = time.time()
server.serve_forever()
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from distutils.core import setup
import setuptools
from pathlib import Path

this_directory = Path(__file__).parent
Expand Down
30 changes: 18 additions & 12 deletions tests/test_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,23 +95,28 @@ def background(context: dict, request: potassium.Request):
res = client.get("/__status__", json={})

assert res.status_code == 200
assert res.json == {
"gpu_available": True,
"sequence_number": 0,
}
assert res.json is not None
assert res.json["gpu_available"] == True
assert res.json["sequence_number"] == 0
assert res.json["idle_time"] > 0
assert res.json["inference_time"] > 0

# send background post in separate thread
res = client.post("/background", json={})
assert res.status_code == 200

# add a small sleep for inference time to be above 0
time.sleep(0.1)

# check status
res = client.get("/__status__", json={})

assert res.status_code == 200
assert res.json == {
"gpu_available": False,
"sequence_number": 1,
}
assert res.json is not None
assert res.json["gpu_available"] == False
assert res.json["sequence_number"] == 1
assert res.json["idle_time"] == 0
assert res.json["inference_time"] > 0

# notify background thread to continue
with resolve_background_condition:
Expand All @@ -124,10 +129,11 @@ def background(context: dict, request: potassium.Request):
res = client.get("/__status__", json={})

assert res.status_code == 200
assert res.json == {
"gpu_available": True,
"sequence_number": 1,
}
assert res.json is not None
assert res.json["gpu_available"] == True
assert res.json["sequence_number"] == 1
assert res.json["idle_time"] > 0
assert res.json["inference_time"] > 0

def test_wait_for_background_task():
app = potassium.Potassium("my_app")
Expand Down
Loading