-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
187 lines (174 loc) · 5.41 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
package main
import (
"context"
"fmt"
"github.com/HyperGAI/serving-agent/api"
"github.com/HyperGAI/serving-agent/platform"
"github.com/HyperGAI/serving-agent/utils"
"github.com/HyperGAI/serving-agent/worker"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"golang.org/x/sys/unix"
"net/http"
"os"
"os/signal"
"time"
)
// main is the entry point of the serving agent. It loads the configuration
// from the current directory, validates it with PreCheck, builds the ML
// platform client selected by config.MLPlatform, creates the internal webhook
// and the Redis task distributor, and finally hands control to runServer.
func main() {
	utils.InitZerolog()

	cfg, loadErr := utils.LoadConfigs(".")
	if loadErr != nil {
		log.Fatal().Err(loadErr).Msg("cannot load config")
	}

	// Route the global logger through zerolog's console writer in development.
	if cfg.Environment == "development" {
		log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
	}

	PreCheck(cfg)

	// Pick the ML platform backend named in the configuration.
	var backend platform.Platform
	switch cfg.MLPlatform {
	case "kserve":
		msg := fmt.Sprintf("using KServe platform: %s", cfg.KServeAddress)
		log.Info().Msg(msg)
		backend = platform.NewKServe(cfg)
	case "replicate":
		msg := fmt.Sprintf("using Replicate platform: %s, %s",
			cfg.ReplicateAddress, cfg.ReplicateModelID)
		log.Info().Msg(msg)
		backend = platform.NewReplicate(cfg)
	case "runpod":
		msg := fmt.Sprintf("using RunPod platform: %s, %s",
			cfg.RunPodAddress, cfg.RunPodModelID)
		log.Info().Msg(msg)
		backend = platform.NewRunPod(cfg)
	case "k8s", "k8s-plugin":
		msg := fmt.Sprintf("using k8s deployment: %s", cfg.K8sPluginAddress)
		log.Info().Msg(msg)
		backend = platform.NewK8sPlugin(cfg)
	default:
		log.Fatal().Msg("ML platform is not set")
	}

	hook := platform.NewInternalWebhook(cfg)
	dist := worker.NewRedisTaskDistributor(cfg)

	/*
		// Previous startup sequence, kept for reference:
		// Start task processor
		go runTaskProcessor(config, service, webhook)
		// Start model API server
		runGinServer(config, service, distributor, webhook)
	*/
	runServer(cfg, backend, dist, hook)
}
// PreCheck validates the loaded configuration and logs a fatal message when a
// check fails:
//   - MaxQueueSize must be at least 1;
//   - TaskTimeout must not be smaller than any of the KServe, K8s plugin,
//     Replicate, or RunPod request timeouts. All four are checked regardless
//     of which platform is selected.
func PreCheck(config utils.Config) {
	if config.MaxQueueSize < 1 {
		log.Fatal().Msg("MaxQueueSize must be > 0")
	}

	// Any single platform timeout exceeding the task timeout is an error.
	switch taskTimeout := config.TaskTimeout; {
	case taskTimeout < config.KServeRequestTimeout,
		taskTimeout < config.K8sPluginRequestTimeout,
		taskTimeout < config.ReplicateRequestTimeout,
		taskTimeout < config.RunPodRequestTimeout:
		log.Fatal().Msg("timeout setting error: TaskTimeout must be >= [Platform]RequestTimeout")
	}
}
// runGinServer builds the API server from the given dependencies and starts
// it on config.HTTPServerAddress. A failure to create or to start the server
// is logged at fatal level.
func runGinServer(
	config utils.Config,
	platform platform.Platform,
	distributor worker.TaskDistributor,
	webhook platform.Webhook,
) {
	srv, createErr := api.NewServer(config, platform, distributor, webhook)
	if createErr != nil {
		log.Fatal().Err(createErr).Msg("cannot create server")
	}

	if startErr := srv.Start(config.HTTPServerAddress); startErr != nil {
		log.Fatal().Err(startErr).Msg("cannot start server")
	}
}
// runTaskProcessor creates a Redis task processor for the given platform and
// webhook and starts it. It logs a fatal message when config.RedisAddress is
// empty or when the processor's Start call returns an error.
func runTaskProcessor(config utils.Config, platform platform.Platform, webhook platform.Webhook) {
	if addr := config.RedisAddress; addr == "" {
		log.Fatal().Msg("redis address is not set")
	}

	processor := worker.NewRedisTaskProcessor(config, platform, webhook)
	log.Info().Msg("start task processor")

	if startErr := processor.Start(); startErr != nil {
		log.Fatal().Err(startErr).Msg("failed to start task processor")
	}
}
// runServer runs the HTTP API server and the Redis task processor in one
// process and blocks until a termination signal is received.
//
// Startup: the API server is created and config.RedisAddress is validated
// before any goroutine is launched. Then the HTTP listener, the queue-size
// check, the task processor and (when config.EnablePeriodicCheck is set) the
// periodic archived-task check each run in their own goroutine.
//
// Signals: SIGTSTP calls taskProcessor.Stop() and keeps waiting; SIGTERM or
// SIGINT starts the shutdown sequence.
//
// Shutdown: when not in Redis cluster mode and a local Redis is used, the
// function optionally sleeps for config.ShutdownDelay seconds and calls
// worker.ShutdownDistributor. It then shuts down the task processor and
// finally the HTTP server, giving in-flight requests up to five seconds.
// Any startup or shutdown failure is logged at fatal level.
func runServer(
	config utils.Config,
	platform platform.Platform,
	distributor worker.TaskDistributor,
	webhook platform.Webhook,
) {
	const (
		// Upper bound on how long httpServer.Shutdown may wait for
		// in-flight requests.
		httpShutdownTimeout = 5 * time.Second
		// Pause between two consecutive archived-task checks.
		periodicCheckInterval = 30 * time.Minute
	)

	server, err := api.NewServer(config, platform, distributor, webhook)
	if err != nil {
		log.Fatal().Err(err).Msg("cannot create server")
	}
	// Validate the Redis address before starting any goroutine so that a
	// misconfigured process never begins accepting HTTP traffic.
	if config.RedisAddress == "" {
		log.Fatal().Msg("redis address is not set")
	}

	// Start the Gin server
	httpServer := &http.Server{
		Addr:    config.HTTPServerAddress,
		Handler: server.Handler(),
	}
	go func() {
		// http.ErrServerClosed is the normal result of a later Shutdown call.
		if err := httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			log.Fatal().Err(err).Msg("cannot start server")
		}
	}()

	// Check task queue size
	go server.CheckQueueSize()

	// Start the Asynq server
	taskProcessor := worker.NewRedisTaskProcessor(config, platform, webhook)
	log.Info().Msg("start task processor")
	go func() {
		if err := taskProcessor.Start(); err != nil {
			log.Fatal().Err(err).Msg("failed to start task processor")
		}
	}()

	// Start checking the archived tasks. config is a by-value copy that is
	// never modified here, so the goroutine is only needed when the check is
	// enabled.
	if config.EnablePeriodicCheck {
		go func() {
			for {
				api.PeriodicCheck(config, distributor, webhook)
				time.Sleep(periodicCheckInterval)
			}
		}()
	}

	// Shutdown the Asynq server
	// https://pkg.go.dev/github.com/hibiken/asynq#example-Server.Shutdown
	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, unix.SIGTERM, unix.SIGINT, unix.SIGTSTP)
	// Handle SIGTERM, SIGINT to exit the program.
	// Handle SIGTSTP to stop processing new tasks.
	for s := range sigs {
		if s != unix.SIGTSTP {
			break // received SIGTERM or SIGINT signal
		}
		taskProcessor.Stop() // stop processing new tasks
	}

	// If not in redis cluster mode and use local redis, run the full shutdown
	if !config.RedisClusterMode && config.UseLocalRedis {
		if config.ShutdownDelay > 0 {
			// Wait for `ShutdownDelay` seconds
			log.Info().Msgf("waiting for %d seconds", config.ShutdownDelay)
			time.Sleep(time.Duration(config.ShutdownDelay) * time.Second)
		}
		worker.ShutdownDistributor(distributor, webhook)
	}
	taskProcessor.Shutdown()

	// Shutdown the Gin server
	// https://gin-gonic.com/docs/examples/graceful-restart-or-stop/
	ctx, cancel := context.WithTimeout(context.Background(), httpShutdownTimeout)
	err = httpServer.Shutdown(ctx)
	// Release the timer right away: a deferred cancel would be skipped by the
	// fatal log below. Shutdown has already returned, so there is nothing
	// left to wait for on ctx; blocking on ctx.Done() here would only stall
	// the exit until the full timeout elapsed.
	cancel()
	if err != nil {
		log.Fatal().Err(err).Msg("server shutdown")
	}
	log.Info().Msg("server exiting")
}