Improving example text: summarization and streaming

matlab-deep-learning · Apr 13, 2024 · 92700a0 · 92700a0
1 parent cb4a442
commit 92700a0
Show file tree

Hide file tree

Showing 4 changed files with 266 additions and 0 deletions.
diff --git a/examples/ExampleStreaming.m b/examples/ExampleStreaming.m
@@ -0,0 +1,142 @@
+%% Process Generated Text in Real Time by Using ChatGPT in Streaming Mode
+% This example shows how to process generated text in real time by using ChatGPT 
+% in streaming mode.
+% 
+% By default, when you pass a prompt to ChatGPT, it generates a response internally 
+% and then outputs it in full at the end. To print out and format generated text 
+% as the model is generating it, use the |StreamFun| name-value argument of the 
+% |openAIChat| class. The streaming function is a custom function handle that 
+% tells the model what to do with the output.
+% 
+% The example includes two parts:
+%% 
+% * First, define and use a custom streaming function to print out generated 
+% text directly as the model generates it.
+% * Then, create an HTML UI Component and define and use a custom streaming 
+% function to update the UI Component in real time as the model generates text.
+%% 
+% To run this example, you need a valid API key from a paid OpenAI API account.
+
+loadenv(".env")
+addpath('..') 
+%% Print Stream Directly to Screen
+% In this example, the streamed output is printed directly to the screen. 
+% 
+% Define the function to print the returned tokens. 
+
+function printToken(token)
+    %PRINTTOKEN Write one streamed token to standard output as-is.
+    %   The token is printed with the "%s" format, so no line break is added
+    %   between consecutive tokens.
+    fprintf(1,"%s",token);
+end
+%% 
+% Create the chat object with the defined function as a handle. 
+
+chat = openAIChat(StreamFun=@printToken);
+%% 
+% Generate response to a prompt in streaming mode. 
+
+prompt = "What is Model-Based Design?";
+generate(chat, prompt, MaxNumTokens=500);
+%% Print Stream to HTML UI Component
+% In this example, the streamed output is printed to the HTML component. 
+% 
+% Create the HTML UI component.
+
+fig = uifigure;
+h = uihtml(fig,Position=[50,10,450,400]);
+%% 
+% Initialize the content of the HTML UI component.
+
+resetTable(h);
+%% 
+% Create the chat object with the function handle, which requires the |uihtml| 
+% object created earlier. 
+
+chat = openAIChat(StreamFun=@(x)printStream(h,x));
+%% 
+% Add the user prompt to the table in the HTML UI component.
+
+userPrompt = "Tell me 5 jokes.";
+addChat(h,"user",userPrompt,"new")
+%% 
+% Generate response to a prompt in streaming mode. 
+
+[txt, message, response] = generate(chat,userPrompt);
+%% 
+% Update the last row with the final output. This is necessary if further updates 
+% are needed to support additional HTML formatting.
+
+addChat(h,"assistant",txt,"current")
+%% Helper functions
+% |resetTable|:
+%% 
+% # Adds the basic HTML structure and the JavaScript that processes the data change 
+% in MATLAB.
+% # The JavaScript gets a reference to the table and the changed data and, if the 
+% 3rd element in the data is "new", adds a new row. 
+% # It populates the new row with two cells and updates the cells from the first 
+% two elements of the data. 
+% # The new row is then appended to the table. 
+% # Otherwise, the JavaScript gets a reference to the last cell of the last row 
+% of the table, and updates it with the 2nd element of the data.
+
+function resetTable(obj)
+    %RESETTABLE Load the chat table page into the HTML UI component.
+    %   Sets obj.HTMLSource to a page containing a two-column table (Role,
+    %   Content) and a script that reacts to "DataChanged" events, then clears
+    %   obj.Data and flushes pending graphics updates.
+    mustBeA(obj,'matlab.ui.control.HTML')
+
+    % Static markup: the table with its header row, followed by the opening
+    % script tag.
+    pageHead = { ...
+        '<html><body><table>', ...
+        '<tr><th>Role</th><th>Content</th></tr></table><script>'};
+
+    % Listener: when the third data element is "new", a row with two cells is
+    % built from the first two data elements and appended to the table;
+    % otherwise the last cell of the last row is overwritten with the second
+    % data element.
+    listener = { ...
+        'function setup(htmlComponent) {', ...
+        'htmlComponent.addEventListener("DataChanged", function(event) {', ...
+        'var table = document.querySelector("table");', ...
+        'var changedData = htmlComponent.Data;', ...
+        'if (changedData[2] == "new") {', ...
+        'var newRow = document.createElement("tr");', ...
+        'var cell1 = document.createElement("td");', ...
+        'var cell2 = document.createElement("td");', ...
+        'cell1.innerHTML = changedData[0];', ...
+        'cell2.innerHTML = changedData[1];', ...
+        'newRow.appendChild(cell1);', ...
+        'newRow.appendChild(cell2);', ...
+        'table.appendChild(newRow);', ...
+        '} else { ', ...
+        'var lastRow = table.rows[table.rows.length - 1];', ...
+        'var lastCell = lastRow.cells[lastRow.cells.length - 1];', ...
+        'lastCell.innerHTML = changedData[1];'};
+
+    % Closes the else branch, the listener, setup(), and the document.
+    pageTail = {'}});}</script></body></html>'};
+
+    % Concatenate all fragments, in order, into one character vector.
+    fragments = [pageHead, listener, pageTail];
+    obj.HTMLSource = [fragments{:}];
+    obj.Data = [];
+    drawnow
+end
+%% 
+% |addChat| adds a new row to the table in the HTML UI component or updates its last row
+
+function addChat(obj,role,content,row)
+    %ADDCHAT Send a chat entry to the table in the HTML UI component.
+    %   Line breaks in CONTENT are converted to <br> tags, then ROLE, the
+    %   converted content, and ROW are assigned to obj.Data as a three-element
+    %   cell array. The page script adds a row when ROW is "new" and otherwise
+    %   rewrites the last row.
+    mustBeA(obj,'matlab.ui.control.HTML')
+    htmlContent = replace(content,newline,"<br>");
+    obj.Data = {role,htmlContent,row};
+    drawnow
+end
+%% 
+% |printStream| is the streaming function and prints the stream in the table 
+% in the HTML UI component
+
+function printStream(h,x)
+    %PRINTSTREAM Show streamed text in the assistant row of the table.
+    %   A zero-length token starts a new "assistant" row. Any other token is
+    %   appended to the content currently stored in h.Data and the existing
+    %   row is refreshed.
+    if strlength(x) == 0
+        % Zero-length token: open a fresh row with empty content.
+        h.Data = {"assistant",string(x),"new"};
+    else
+        % Turn line breaks into <br> tags (no effect when the token has none),
+        % then extend the text accumulated so far.
+        fragment = string(replace(x,newline,"<br>"));
+        accumulated = h.Data{2} + fragment;
+        h.Data = {"assistant",accumulated,"current"};
+    end
+    drawnow
+end
+%% 
+% _Copyright 2024 The MathWorks, Inc._
diff --git a/examples/ExampleStreaming.mlx b/examples/ExampleStreaming.mlx
diff --git a/examples/ExampleSummarization.m b/examples/ExampleSummarization.m
@@ -0,0 +1,124 @@
+%% Summarize Large Documents Using ChatGPT and MATLAB
+% This example shows how to use ChatGPT to summarize documents that are too 
+% large to be summarized at once.
+% 
+% To summarize short documents using ChatGPT, you can pass the documents directly 
+% as a prompt together with an instruction to summarize them. However, ChatGPT 
+% can only process prompts of limited size.
+% 
+% To summarize documents that are larger than this limit, split the documents 
+% up into smaller documents. Summarize the smaller document chunks, then pass 
+% all of the summaries to ChatGPT to generate one overall summary.
+%% 
+% This example includes four steps:
+% 
+% * Download the complete text of "Alice in Wonderland" by Lewis Carroll from 
+% Project Gutenberg.
+% * Split the documents up into chunks of less than 3000 words. (Section title: 
+% "Split Document Into Chunks")
+% * Use ChatGPT to create summaries of each chunk. ("Summarize Chunks")
+% * Then use ChatGPT to create a summary of all of the summaries. ("Summarize 
+% Document")
+%% 
+% To run this example, you need Text Analytics Toolbox™.
+% 
+% To run this example, you need a valid API key from a paid OpenAI™ API account.
+
+loadenv(".env")
+addpath('..') 
+%% Download Text Data
+% Download and read the content from Alice's Adventures in Wonderland by Lewis 
+% Carroll from Project Gutenberg.
+% 
+% First read the contents of the webpage.
+
+options = weboptions(Timeout=30); % raise the web request timeout to 30 (seconds, per weboptions)
+code = webread("https://www.gutenberg.org/files/11/11-h/11-h.htm", options); % content of the book's HTML page
+longText = extractHTMLText(string(code)); % text extracted from the HTML; this is the document to summarize
+%% Split Document Into Chunks
+% Large language models have a limit in terms of how much text they can accept 
+% as input, so if you try to summarize the complete book, you will likely get 
+% an error. A workaround is splitting the book into chunks and summarizing each 
+% chunk individually. The chunk size is defined in |limitChunkWords|, which restricts 
+% the number of words in a chunk.
+
+% incrementalSummary holds the text that is chunked next; it starts as the whole document.
+incrementalSummary = longText;
+limitChunkWords = 3000; % size limit handed to createChunks for every chunk
+chunks = createChunks(incrementalSummary, limitChunkWords);
+%% Summarize Chunks
+% Initialize a ChatGPT session with the role of summarizing text
+
+summarizer = openAIChat("You are a professional summarizer.");
+%% 
+% Loop to gradually summarize the text chunk by chunk, reducing the amount of 
+% text with each iteration. 
+
+numCalls = 0;
+while numel(chunks)>1
+    numChunks = numel(chunks);
+    summarizedChunks = strings(size(chunks));
+    numCalls = numCalls + numChunks;
+%% 
+% Cap the total number of calls so that the example never makes more requests 
+% than expected. Adjust this value to suit your application.
+
+    if numCalls > 20
+        error("Document is too long to be summarized.")
+    end
+
+    for k = 1:numChunks
+        request = "Summarize this content:" + newline + chunks(k);
+        summarizedChunks(k) = generate(summarizer, request);
+    end
+%% 
+% Join the chunk summaries; the result is the text for the next iteration.
+
+    incrementalSummary = join(summarizedChunks);
+%% 
+% Split the joined summaries into chunks again for the subsequent iteration.
+
+    chunks = createChunks(incrementalSummary, limitChunkWords);
+end
+%% Summarize Document
+% Compile the final summary by combining the summaries from all the chunks.
+
+% Build the final prompt: the instruction followed by the merged chunk summaries.
+finalInstruction = "The following text is a combination of summaries. " + ...
+    "Provide a cohesive and coherent summary combining these smaller summaries, preserving as much information as possible:";
+fullSummary = generate(summarizer, finalInstruction + newline + incrementalSummary);
+% Display the summary with one sentence per line.
+wrapText(fullSummary)
+%% |createChunks| function
+% This function segments a long text into smaller parts of a predefined size 
+% to facilitate easier summarization. It preserves the structure of sentences. 
+% The |chunkSize| should be large enough to fit at least one sentence.
+
+function chunks = createChunks(text, chunkSize)
+    %CREATECHUNKS Split text into chunks made of whole sentences.
+    %   chunks = createChunks(text, chunkSize) returns a column string array.
+    %   Each element concatenates consecutive sentences of TEXT whose lengths,
+    %   as reported by doclength, add up to less than CHUNKSIZE. A sentence
+    %   that reaches CHUNKSIZE on its own is kept whole as a separate chunk.
+    %   The result is empty (0-by-1) when TEXT yields no sentences.
+
+    % Tokenize the input, then split it into one document per sentence
+    sentences = splitSentences(tokenizedDocument(text));
+
+    chunks = strings(0,1);
+    currentChunk = "";
+    currentChunkSize = 0;
+
+    % Aggregate sentences into the current chunk until adding one more would
+    % reach the size limit, then start a new chunk with that sentence
+    for i = 1:length(sentences)
+        sentenceSize = doclength(sentences(i));
+        sentenceText = joinWords(sentences(i));
+        if currentChunkSize + sentenceSize < chunkSize
+            currentChunkSize = currentChunkSize + sentenceSize;
+            currentChunk = currentChunk + " " + sentenceText;
+        else
+            % Store the finished chunk, unless nothing has been collected yet
+            % (the very first sentence alone reaches the limit)
+            if currentChunkSize > 0
+                chunks = [chunks; currentChunk]; %#ok<AGROW>
+            end
+            currentChunkSize = sentenceSize;
+            currentChunk = sentenceText;
+        end
+    end
+
+    % Store the last chunk; without this the tail of the text would be lost
+    if currentChunkSize > 0
+        chunks = [chunks; currentChunk];
+    end
+end
+%% |wrapText| function
+% This function splits text into sentences and then concatenates them again 
+% using |newline| to make it easier to visualize text in this example
+
+function wrappedText = wrapText(text)
+    %WRAPTEXT Place each sentence of the text on its own line.
+    %   The text is split into sentences, which are then joined again with
+    %   newline characters between them.
+    sentences = splitSentences(text);
+    wrappedText = join(sentences,newline);
+end
+%% 
+% _Copyright 2023 The MathWorks, Inc._
diff --git a/examples/ExampleSummarization.mlx b/examples/ExampleSummarization.mlx