[OpenAI] Add usage to streaming, add prefill and decode speed to usage
CharlieFRuan committed Jun 4, 2024
1 parent 22f0d37 commit a9a34cb
Showing 13 changed files with 119 additions and 19 deletions.
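Taken together, these changes let callers read token counts and throughput directly from the response instead of calling `engine.runtimeStatsText()`; for streaming, the numbers arrive on the last chunk. Below is a minimal sketch of the resulting call pattern; the model id and prompt are placeholders and not part of this commit.

```typescript
import * as webllm from "@mlc-ai/web-llm";

async function reportUsage() {
  // Placeholder model id; use any model available in your WebLLM setup.
  const engine = await webllm.CreateMLCEngine("Llama-3-8B-Instruct-q4f32_1-MLC");

  const chunks = await engine.chat.completions.create({
    messages: [{ role: "user", content: "Tell me a short story." }],
    stream: true, // with this commit, usage is attached to the final chunk
  });

  let lastChunk: webllm.ChatCompletionChunk | undefined = undefined;
  for await (const chunk of chunks) {
    lastChunk = chunk;
  }

  // Token counts plus the newly added throughput fields.
  const usage = lastChunk?.usage;
  if (usage) {
    console.log(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens);
    console.log(usage.prefill_tokens_per_s, usage.decode_tokens_per_s);
  }
}
```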
6 changes: 4 additions & 2 deletions README.md
@@ -162,7 +162,7 @@ const reply = await engine.chat.completions.create({
messages,
});
console.log(reply.choices[0].message);
console.log(await engine.runtimeStatsText());
console.log(reply.usage);
```

### Streaming
@@ -183,14 +183,16 @@ const chunks = await engine.chat.completions.create({
});

let reply = "";
let lastChunk: webllm.ChatCompletionChunk | undefined = undefined;
for await (const chunk of chunks) {
reply += chunk.choices[0].delta.content || "";
lastChunk = chunk;
console.log(reply);
}

const fullReply = await engine.getMessage()
console.log(fullReply);
console.log(await engine.runtimeStatsText());
console.log(lastChunk.usage);
```

## Advanced Usage
4 changes: 2 additions & 2 deletions examples/function-calling/src/function_calling.ts
@@ -55,6 +55,7 @@ async function main() {
if (!request.stream) {
const reply0 = await engine.chat.completions.create(request);
console.log(reply0.choices[0]);
console.log(reply0.usage);
} else {
// If streaming, the last chunk returns tool calls
const asyncChunkGenerator = await engine.chat.completions.create(request);
@@ -70,9 +71,8 @@ async function main() {
lastChunk = chunk;
}
console.log(lastChunk!.choices[0].delta);
console.log(lastChunk!.usage);
}

console.log(await engine.runtimeStatsText());
}

main();
9 changes: 7 additions & 2 deletions examples/get-started-web-worker/src/main.ts
@@ -46,7 +46,7 @@ async function mainNonStreaming() {
const reply0 = await engine.chat.completions.create(request);
console.log(reply0);

console.log(await engine.runtimeStatsText());
console.log(reply0.usage);
}

/**
@@ -84,17 +84,22 @@ async function mainStreaming() {

const asyncChunkGenerator = await engine.chat.completions.create(request);
let message = "";
let lastChunk: webllm.ChatCompletionChunk | undefined = undefined;
for await (const chunk of asyncChunkGenerator) {
console.log(chunk);
if (chunk.choices[0].delta.content) {
// Last chunk has undefined content
message += chunk.choices[0].delta.content;
}
setLabel("generate-label", message);
lastChunk = chunk;
// engine.interruptGenerate(); // works with interrupt as well
}
console.log("Final message:\n", await engine.getMessage()); // the concatenated message
console.log(await engine.runtimeStatsText());
if (lastChunk?.usage) {
// If streaming was interrupted before finishing, we would not have usage.
console.log(lastChunk.usage);
}
}

// Run one of the functions below
2 changes: 1 addition & 1 deletion examples/get-started/src/get_started.ts
@@ -52,7 +52,7 @@ async function main() {
top_logprobs: 2,
});
console.log(reply0);
console.log(await engine.runtimeStatsText());
console.log(reply0.usage);

// To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)`
}
2 changes: 1 addition & 1 deletion examples/json-mode/src/json_mode.ts
@@ -34,7 +34,7 @@ async function main() {
const reply0 = await engine.chatCompletion(request);
console.log(reply0);
console.log("First reply's last choice:\n" + (await engine.getMessage()));
console.log(await engine.runtimeStatsText());
console.log(reply0.usage);
}

main();
6 changes: 3 additions & 3 deletions examples/json-schema/src/json_schema.ts
@@ -62,7 +62,7 @@ async function simpleStructuredTextExample() {
const reply0 = await engine.chatCompletion(request);
console.log(reply0);
console.log("Output:\n" + (await engine.getMessage()));
console.log(await engine.runtimeStatsText());
console.log(reply0.usage);
}

// The json schema and prompt is taken from
@@ -129,7 +129,7 @@ async function harryPotterExample() {
const reply = await engine.chatCompletion(request);
console.log(reply);
console.log("Output:\n" + (await engine.getMessage()));
console.log(await engine.runtimeStatsText());
console.log(reply.usage);
}

async function functionCallingExample() {
@@ -207,7 +207,7 @@ async function functionCallingExample() {
const reply = await engine.chat.completions.create(request);
console.log(reply.choices[0].message.content);

console.log(await engine.runtimeStatsText());
console.log(reply.usage);
}

async function main() {
4 changes: 2 additions & 2 deletions examples/multi-round-chat/src/multi_round_chat.ts
@@ -43,6 +43,7 @@ async function main() {
const replyMessage0 = await engine.getMessage();
console.log(reply0);
console.log(replyMessage0);
console.log(reply0.usage);

// Round 1
// Append generated response to messages
@@ -62,6 +63,7 @@ async function main() {
const replyMessage1 = await engine.getMessage();
console.log(reply1);
console.log(replyMessage1);
console.log(reply1.usage);

// If we used multiround chat, request1 should only prefill a small number of tokens
const prefillTokens0 = reply0.usage?.prompt_tokens;
@@ -75,8 +77,6 @@
) {
throw Error("Multi-round chat is not triggered as expected.");
}

console.log(await engine.runtimeStatsText());
}

main();
3 changes: 2 additions & 1 deletion examples/seed-to-reproduce/src/seed.ts
@@ -38,6 +38,7 @@ async function main() {
const reply0 = await engine.chat.completions.create(request);
console.log(reply0);
console.log("First reply's last choice:\n" + (await engine.getMessage()));
console.log(reply0.usage);

const reply1 = await engine.chat.completions.create(request);
console.log(reply1);
@@ -56,7 +57,7 @@
}
}

console.log(await engine.runtimeStatsText());
console.log(reply1.usage);
}

// Run one of the functions
9 changes: 7 additions & 2 deletions examples/service-worker/src/main.ts
@@ -65,7 +65,7 @@ async function mainNonStreaming() {
console.log(reply0);
setLabel("generate-label", reply0.choices[0].message.content || "");

console.log(await engine.runtimeStatsText());
console.log(reply0.usage);
}

/**
@@ -101,17 +101,22 @@ async function mainStreaming() {

const asyncChunkGenerator = await engine.chat.completions.create(request);
let message = "";
let lastChunk: webllm.ChatCompletionChunk | undefined = undefined;
for await (const chunk of asyncChunkGenerator) {
console.log(chunk);
if (chunk.choices[0].delta.content) {
// Last chunk has undefined content
message += chunk.choices[0].delta.content;
}
setLabel("generate-label", message);
lastChunk = chunk;
// engine.interruptGenerate(); // works with interrupt as well
}
console.log("Final message:\n", await engine.getMessage()); // the concatenated message
console.log(await engine.runtimeStatsText());
if (lastChunk?.usage) {
// If streaming was interrupted before finishing, we would not have usage.
console.log(lastChunk.usage);
}
}

registerServiceWorker();
7 changes: 6 additions & 1 deletion examples/streaming/src/streaming.ts
@@ -37,17 +37,22 @@ async function main() {

const asyncChunkGenerator = await engine.chat.completions.create(request);
let message = "";
let lastChunk: webllm.ChatCompletionChunk | undefined = undefined;
for await (const chunk of asyncChunkGenerator) {
console.log(chunk);
if (chunk.choices[0].delta.content) {
// Last chunk has undefined content
message += chunk.choices[0].delta.content;
}
setLabel("generate-label", message);
lastChunk = chunk;
// engine.interruptGenerate(); // works with interrupt as well
}
console.log("Final message:\n", await engine.getMessage()); // the concatenated message
console.log(await engine.runtimeStatsText());
if (lastChunk?.usage) {
// If streaming was interrupted before finishing, we would not have usage.
console.log(lastChunk.usage);
}
}

main();
22 changes: 21 additions & 1 deletion src/engine.ts
@@ -446,6 +446,14 @@ export class MLCEngine implements MLCEngineInterface {
) as Array<ChatCompletionChunk.Choice.Delta.ToolCall>;
}

const completion_tokens =
this.getPipeline().getCurRoundDecodingTotalTokens();
const prompt_tokens = this.getPipeline().getCurRoundPrefillTotalTokens();
const prefill_tokens_per_s =
this.getPipeline().getCurRoundPrefillTokensPerSec();
const decode_tokens_per_s =
this.getPipeline().getCurRoundDecodingTokensPerSec();

const lastChunk: ChatCompletionChunk = {
id: id,
choices: [
Expand All @@ -456,14 +464,20 @@ export class MLCEngine implements MLCEngineInterface {
tool_calls: tool_calls,
}
: {},
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
finish_reason: finish_reason,
index: 0,
},
],
model: model,
object: "chat.completion.chunk",
created: created,
usage: {
completion_tokens: completion_tokens,
prompt_tokens: prompt_tokens,
total_tokens: completion_tokens + prompt_tokens,
prefill_tokens_per_s: prefill_tokens_per_s,
decode_tokens_per_s: decode_tokens_per_s,
} as CompletionUsage,
};
yield lastChunk;
}
@@ -522,6 +536,8 @@ export class MLCEngine implements MLCEngineInterface {
const choices: Array<ChatCompletion.Choice> = [];
let completion_tokens = 0;
let prompt_tokens = 0;
let prefill_time = 0;
let decode_time = 0;
for (let i = 0; i < n; i++) {
let outputMessage: string;
if (this.interruptSignal) {
@@ -573,6 +589,8 @@ export class MLCEngine implements MLCEngineInterface {
});
completion_tokens += this.getPipeline().getCurRoundDecodingTotalTokens();
prompt_tokens += this.getPipeline().getCurRoundPrefillTotalTokens();
prefill_time += this.getPipeline().getCurRoundPrefillTotalTime();
decode_time += this.getPipeline().getCurRoundDecodingTotalTime();
}

const response: ChatCompletion = {
@@ -585,6 +603,8 @@
completion_tokens: completion_tokens,
prompt_tokens: prompt_tokens,
total_tokens: completion_tokens + prompt_tokens,
prefill_tokens_per_s: prompt_tokens / prefill_time,
decode_tokens_per_s: completion_tokens / decode_time,
} as CompletionUsage,
};

48 changes: 47 additions & 1 deletion src/llm_chat.ts
@@ -70,9 +70,11 @@ export class LLMChatPipeline {
private decodingTotalTokens = 0;
private prefillTotalTime = 0;
private prefillTotalTokens = 0;
// same as `prefillTotalTokens` and `decodingTotalTokens`, but reset at every `prefillStep()`
// same stats as above, but reset at every `prefillStep()`
private curRoundDecodingTotalTokens = 0;
private curRoundPrefillTotalTokens = 0;
private curRoundDecodingTotalTime = 0;
private curRoundPrefillTotalTime = 0;

// LogitProcessor
private logitProcessor?: LogitProcessor = undefined;
@@ -356,6 +358,20 @@ export class LLMChatPipeline {
return this.curRoundPrefillTotalTokens;
}

/**
* @returns the time spent on decode for a single request or a single choice in the request.
*/
getCurRoundDecodingTotalTime(): number {
return this.curRoundDecodingTotalTime;
}

/**
* @returns the time spent on prefill for a single request or a single choice in the request.
*/
getCurRoundPrefillTotalTime(): number {
return this.curRoundPrefillTotalTime;
}

/**
* @returns Runtime stats information.
*/
@@ -366,6 +382,30 @@
);
}

/**
* @returns Runtime stats information, starting from the last prefill performed.
*/
curRoundRuntimeStatsText(): string {
return (
`prefill: ${this.getCurRoundPrefillTokensPerSec().toFixed(4)} tokens/sec, ` +
`decoding: ${this.getCurRoundDecodingTokensPerSec().toFixed(4)} tokens/sec`
);
}

/**
* @returns Prefill tokens per second, starting from the last prefill performed.
*/
getCurRoundPrefillTokensPerSec(): number {
return this.curRoundPrefillTotalTokens / this.curRoundPrefillTotalTime;
}

/**
* @returns Decoding tokens per second, starting from the last prefill performed.
*/
getCurRoundDecodingTokensPerSec(): number {
return this.curRoundDecodingTotalTokens / this.curRoundDecodingTotalTime;
}

/**
* Set the seed for the RNG `this.tvm.rng`.
*/
@@ -411,6 +451,8 @@
this.tokenLogprobArray = [];
this.curRoundDecodingTotalTokens = 0;
this.curRoundPrefillTotalTokens = 0;
this.curRoundPrefillTotalTime = 0;
this.curRoundDecodingTotalTime = 0;
this.stopTriggered = false;
const conversation = this.conversation;

@@ -481,6 +523,7 @@
this.prefillTotalTime += (tend - tstart) / 1e3;
this.prefillTotalTokens += promptTokens.length;
this.curRoundPrefillTotalTokens += promptTokens.length;
this.curRoundPrefillTotalTime += (tend - tstart) / 1e3;

this.processNextToken(nextToken, genConfig);
}
@@ -508,6 +551,7 @@
this.decodingTotalTime += (tend - tstart) / 1e3;
this.decodingTotalTokens += 1;
this.curRoundDecodingTotalTokens += 1;
this.curRoundDecodingTotalTime += (tend - tstart) / 1e3;

this.processNextToken(nextToken, genConfig);
}
@@ -991,10 +1035,12 @@
this.prefillTotalTime += (tend - tstart) / 1e3;
this.prefillTotalTokens += inputIds.length;
this.curRoundPrefillTotalTokens += inputIds.length;
this.curRoundPrefillTotalTime += (tend - tstart) / 1e3;
} else {
this.decodingTotalTime += (tend - tstart) / 1e3;
this.decodingTotalTokens += 1;
this.curRoundDecodingTotalTokens += 1;
this.curRoundDecodingTotalTime += (tend - tstart) / 1e3;
}
return nextToken;
}
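For reference, the per-second figures surfaced in `usage` are plain ratios of the per-round counters maintained above. The standalone helper below only illustrates that arithmetic with hypothetical names; it is not part of the pipeline class.

```typescript
// Illustrative only: hypothetical names mirroring the per-round bookkeeping above.
interface RoundStats {
  prefillTokens: number;  // tokens consumed by the last prefill
  prefillSeconds: number; // wall-clock seconds spent in that prefill
  decodeTokens: number;   // tokens generated since the last prefill
  decodeSeconds: number;  // wall-clock seconds spent decoding them
}

// Matches how `prefill_tokens_per_s` and `decode_tokens_per_s` are derived.
function roundThroughput(s: RoundStats) {
  return {
    prefill_tokens_per_s: s.prefillTokens / s.prefillSeconds,
    decode_tokens_per_s: s.decodeTokens / s.decodeSeconds,
  };
}
```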