From dd88ebc5b41a94a43a3630d44283eba5342e2c68 Mon Sep 17 00:00:00 2001 From: Roopan-Microsoft <168007406+Roopan-Microsoft@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:23:21 +0530 Subject: [PATCH] feat: Text to Speech feature implementation (#1235) Co-authored-by: Rohini-Microsoft Co-authored-by: Bangarraju-Microsoft Co-authored-by: Harmanpreet Kaur --- code/create_app.py | 1 + code/frontend/src/assets/pauseIcon.svg | 3 + code/frontend/src/assets/speakerIcon.svg | 3 + .../frontend/src/components/Answer/Answer.tsx | 383 +++++++++++++----- .../QuestionInput/QuestionInput.module.css | 6 +- .../QuestionInput/QuestionInput.tsx | 22 +- code/frontend/src/pages/chat/Chat.tsx | 12 + .../backend_api/default/test_speech_token.py | 1 + code/tests/test_app.py | 2 + 9 files changed, 326 insertions(+), 107 deletions(-) create mode 100644 code/frontend/src/assets/pauseIcon.svg create mode 100644 code/frontend/src/assets/speakerIcon.svg diff --git a/code/create_app.py b/code/create_app.py index c8eba08b4..a01d37832 100644 --- a/code/create_app.py +++ b/code/create_app.py @@ -437,6 +437,7 @@ def speech_config(): if response.status_code == 200: return { "token": response.text, + "key": speech_key, "region": env_helper.AZURE_SPEECH_SERVICE_REGION, "languages": env_helper.AZURE_SPEECH_RECOGNIZER_LANGUAGES, } diff --git a/code/frontend/src/assets/pauseIcon.svg b/code/frontend/src/assets/pauseIcon.svg new file mode 100644 index 000000000..427df46b2 --- /dev/null +++ b/code/frontend/src/assets/pauseIcon.svg @@ -0,0 +1,3 @@ + + + diff --git a/code/frontend/src/assets/speakerIcon.svg b/code/frontend/src/assets/speakerIcon.svg new file mode 100644 index 000000000..23c9286ad --- /dev/null +++ b/code/frontend/src/assets/speakerIcon.svg @@ -0,0 +1,3 @@ + + + diff --git a/code/frontend/src/components/Answer/Answer.tsx b/code/frontend/src/components/Answer/Answer.tsx index 1f689a20e..3d78831f9 100644 --- a/code/frontend/src/components/Answer/Answer.tsx +++ b/code/frontend/src/components/Answer/Answer.tsx @@ -1,123 +1,310 @@ -import { useEffect, useMemo, useState, useRef } from "react"; +import { useEffect, useMemo, useState, useRef, forwardRef } from "react"; import { useBoolean } from "@fluentui/react-hooks" import { FontIcon, Stack, Text } from "@fluentui/react"; - import styles from "./Answer.module.css"; - import { AskResponse, Citation } from "../../api"; import { parseAnswer } from "./AnswerParser"; - import ReactMarkdown from "react-markdown"; import remarkGfm from "remark-gfm"; import supersub from 'remark-supersub' +import pauseIcon from "../../assets/pauseIcon.svg"; +import speakerIcon from "../../assets/speakerIcon.svg"; +import * as sdk from 'microsoft-cognitiveservices-speech-sdk'; +import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk'; + +declare global { + interface Window { + webkitAudioContext: typeof AudioContext; + } +} interface Props { - answer: AskResponse; - onCitationClicked: (citedDocument: Citation) => void; - index: number; + answer: AskResponse; + onCitationClicked: (citedDocument: Citation) => void; + onSpeak?: any; + isActive?: boolean; + index: number; } +const MyStackComponent = forwardRef((props, ref) => ( +
+)); export const Answer = ({ - answer, - onCitationClicked, - index, + answer, + onCitationClicked, + onSpeak, + isActive, + index, }: Props) => { - const [isRefAccordionOpen, { toggle: toggleIsRefAccordionOpen }] = useBoolean(false); - const filePathTruncationLimit = 50; + const [isRefAccordionOpen, { toggle: toggleIsRefAccordionOpen }] = useBoolean(false); + const filePathTruncationLimit = 50; + const answerContainerRef = useRef(null);// read the text from the container + const messageBoxId = "message-" + index; + const [isSpeaking, setIsSpeaking] = useState(false); // for speaker on + const [showSpeaker, setShowSpeaker] = useState(true); //for show and hide the speaker icon + const [isPaused, setIsPaused] = useState(false); //for pause + const parsedAnswer = useMemo(() => parseAnswer(answer), [answer]); + const [chevronIsExpanded, setChevronIsExpanded] = useState(isRefAccordionOpen); + const refContainer = useRef(null); + const [audioContext, setAudioContext] = useState(null); //Manully manage the audio context eg pausing resuming - const messageBoxId = "message-" + index; - const parsedAnswer = useMemo(() => parseAnswer(answer), [answer]); - const [chevronIsExpanded, setChevronIsExpanded] = useState(isRefAccordionOpen); - const refContainer = useRef(null); - const handleChevronClick = () => { - setChevronIsExpanded(!chevronIsExpanded); - toggleIsRefAccordionOpen(); - }; + const [synthesizerData, setSynthesizerData] = useState({ key: '', region: '' }); + const [synthesizer, setSynthesizer] = useState(null); + const [audioDestination, setAudioDestination] = useState(null); + const [playbackTimeout, setPlaybackTimeout] = useState(null); + const [remainingDuration, setRemainingDuration] = useState(0); + const [startTime, setStartTime] = useState(null); + + const handleChevronClick = () => { + setChevronIsExpanded(!chevronIsExpanded); + toggleIsRefAccordionOpen(); + }; + + const initializeSynthesizer = () =>{ + const speechConfig = sdk.SpeechConfig.fromSubscription(synthesizerData.key, synthesizerData.region); + const newAudioDestination = new SpeechSDK.SpeakerAudioDestination(); + const audioConfig = SpeechSDK.AudioConfig.fromSpeakerOutput(newAudioDestination); + const newSynthesizer = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig); + setSynthesizer(newSynthesizer); + setAudioDestination(newAudioDestination); + if (playbackTimeout) { + clearTimeout(playbackTimeout); + } + setRemainingDuration(0); + } + + useEffect(() => { + if (synthesizerData.key != '') { + initializeSynthesizer(); - useEffect(() => { - setChevronIsExpanded(isRefAccordionOpen); - if(chevronIsExpanded && refContainer.current){ - refContainer.current.scrollIntoView({ behavior:'smooth'}); + return () => { + if (synthesizer) { + synthesizer.close(); } - }, [chevronIsExpanded,isRefAccordionOpen]); - - const createCitationFilepath = (citation: Citation, index: number, truncate: boolean = false) => { - let citationFilename = ""; - - if (citation.filepath && citation.chunk_id != null) { - if (truncate && citation.filepath.length > filePathTruncationLimit) { - const citationLength = citation.filepath.length; - citationFilename = `${citation.filepath.substring(0, 20)}...${citation.filepath.substring(citationLength -20)} - Part ${citation.chunk_id}`; - } - else { - citationFilename = `${citation.filepath} - Part ${citation.chunk_id}`; - } + if (audioDestination) { + audioDestination.close(); } - else { - citationFilename = `Citation ${index}`; + if (playbackTimeout) { + clearTimeout(playbackTimeout); } - return 
citationFilename; + }; } - useEffect(() => { - const handleCopy = () => { - alert("Please consider where you paste this content."); - }; - const messageBox = document.getElementById(messageBoxId); - messageBox?.addEventListener("copy", handleCopy); - return () => { - messageBox?.removeEventListener("copy", handleCopy); - }; - }, []); - - return ( - <> - - - - - - - AI-generated content may be incorrect - - - {!!parsedAnswer.citations.length && ( - - - - - {parsedAnswer.citations.length > 1 ? parsedAnswer.citations.length + " references" : "1 reference"} - - - - - - - )} + }, [index, synthesizerData]); - - {chevronIsExpanded && -
- {parsedAnswer.citations.map((citation, idx) => { - return ( - onCitationClicked(citation)} className={styles.citationContainer}> -
{idx}
- {createCitationFilepath(citation, idx, true)} -
); - })} -
- } -
- + useEffect(() => { + const fetchSythesizerData = async () => { + const response = await fetch('/api/speech'); + if (!response.ok) { + throw new Error('Network response was not ok'); + } + const data = await response.json(); + setSynthesizerData({ key: data.key, region: data.region }); + } + fetchSythesizerData(); + }, []) + + useEffect(() => { + if (!isActive && synthesizer && isSpeaking) { + resetSpeech() + } + }, [isActive, synthesizer]); + + useEffect(() => { + setChevronIsExpanded(isRefAccordionOpen); + if (chevronIsExpanded && refContainer.current) { + refContainer.current.scrollIntoView({ behavior: 'smooth' }); + } + // After genrating answer then only show speaker icon + if (parsedAnswer.markdownFormatText === "Generating answer...") { + setShowSpeaker(false); + } else { + setShowSpeaker(true); + } + }, [chevronIsExpanded, isRefAccordionOpen, parsedAnswer]); + + const createCitationFilepath = (citation: Citation, index: number, truncate: boolean = false) => { + let citationFilename = ""; + + if (citation.filepath && citation.chunk_id != null) { + if (truncate && citation.filepath.length > filePathTruncationLimit) { + const citationLength = citation.filepath.length; + citationFilename = `${citation.filepath.substring(0, 20)}...${citation.filepath.substring(citationLength - 20)} - Part ${citation.chunk_id}`; + } + else { + citationFilename = `${citation.filepath} - Part ${citation.chunk_id}`; + } + } + else { + citationFilename = `Citation ${index}`; + } + return citationFilename; + } + + const getAnswerText = () => { + if (answerContainerRef.current) { + const text = answerContainerRef.current.textContent ?? ''; + return text; + } + return ''; + }; + + const startSpeech = () => { + if (synthesizer) { + const text = getAnswerText(); + synthesizer?.speakTextAsync( + text, + result => { + if (result.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) { + const duration = result.audioDuration / 10000; + setRemainingDuration(duration); + setStartTime(Date.now()); + handleTimeout(duration); + } else if (result.reason === SpeechSDK.ResultReason.Canceled) { + setIsSpeaking(false); + setIsPaused(false); + } else { + console.error('Synthesis failed: ', result.errorDetails); + } + }, + error => { + console.error('Synthesis error: ', error); + setIsSpeaking(false); + setIsPaused(false); + } + ); + setIsSpeaking(true); + } + }; + + const handleTimeout = (remainingDuration: number) => { + setPlaybackTimeout( + setTimeout(() => { + setIsSpeaking(false); + setIsPaused(false); + onSpeak(index , 'stop'); + }, remainingDuration) ); + }; + + const resetSpeech = () => { + //audioDestination?.close(); + audioDestination?.pause(); + setIsSpeaking(false); + setIsPaused(false); + //synthesizer?.close(); + initializeSynthesizer(); + } + const handleSpeakPauseResume = () => { + if (isSpeaking) { + if (isPaused) { + onSpeak(index , 'speak'); + audioDestination?.resume(); + setIsPaused(false); + setStartTime(Date.now()); + handleTimeout(remainingDuration); + } else { + onSpeak(index , 'pause'); + audioDestination?.pause(); + setIsPaused(true); + const elapsed = Date.now() - (startTime || 0); + const newRemainingDuration = remainingDuration - elapsed; + setRemainingDuration(newRemainingDuration); + if (playbackTimeout) { + clearTimeout(playbackTimeout); + } + } + } else { + onSpeak(index , 'speak'); + startSpeech(); + } + }; + + useEffect(() => { + const handleCopy = () => { + alert("Please consider where you paste this content."); + }; + const messageBox = document.getElementById(messageBoxId); + 
messageBox?.addEventListener("copy", handleCopy); + return () => { + messageBox?.removeEventListener("copy", handleCopy); + }; + }, []); + + const getSpeechButtons = () => { + const speechStatus = !showSpeaker ? "none" : showSpeaker && !isSpeaking ? "Speak" + : isSpeaking && isPaused ? "Resume" : "Pause"; + + switch (speechStatus) { + case 'Speak': + case 'Resume': + return ( + + ) + case 'Pause': + return ( + + ) + default: + return null; + } + } + + return ( + <> + + + + + + + AI-generated content may be incorrect + + + {!!parsedAnswer.citations.length && ( + + + + + {parsedAnswer.citations.length > 1 ? parsedAnswer.citations.length + " references" : "1 reference"} + + + + + + + )} + + + {chevronIsExpanded && +
+ {parsedAnswer.citations.map((citation, idx) => { + return ( + onCitationClicked(citation)} className={styles.citationContainer}> +
{idx}
+ {createCitationFilepath(citation, idx, true)} +
); + })} +
+ } + + {getSpeechButtons()} + +
+ + ); }; diff --git a/code/frontend/src/components/QuestionInput/QuestionInput.module.css b/code/frontend/src/components/QuestionInput/QuestionInput.module.css index 5788340fd..146d075c1 100644 --- a/code/frontend/src/components/QuestionInput/QuestionInput.module.css +++ b/code/frontend/src/components/QuestionInput/QuestionInput.module.css @@ -27,7 +27,7 @@ right: 24px; bottom: 20px; position: static; - margin-right: 12px; + /* margin-right: 12px; */ } .questionInputSendButton { @@ -39,7 +39,7 @@ pointer-events: none; width: 24px; height: 23px; - margin-right: 12px; + /* margin-right: 12px; */ } .questionInputSendButtonDisabled { @@ -75,6 +75,8 @@ width: 24px; height: 24px; margin-bottom: 12px; + border: none; + background: transparent; } .microphoneAndSendContainer { diff --git a/code/frontend/src/components/QuestionInput/QuestionInput.tsx b/code/frontend/src/components/QuestionInput/QuestionInput.tsx index 0d3bd8e8d..6a8cbef6f 100644 --- a/code/frontend/src/components/QuestionInput/QuestionInput.tsx +++ b/code/frontend/src/components/QuestionInput/QuestionInput.tsx @@ -17,6 +17,7 @@ interface Props { recognizedText: string; isListening: boolean; isRecognizing: boolean; + isTextToSpeachActive : boolean; setRecognizedText: (text: string) => void; } @@ -32,11 +33,13 @@ export const QuestionInput = ({ isListening, isRecognizing, setRecognizedText, + isTextToSpeachActive }: Props) => { const [question, setQuestion] = useState(""); const [liveRecognizedText, setLiveRecognizedText] = useState(""); const [microphoneIconActive, setMicrophoneIconActive] = useState(false); + const [isMicrophoneDisabled , setIsMicrophoneDisabled] = useState(false); const [isTextAreaDisabled, setIsTextAreaDisabled] = useState(false); useEffect(() => { if (isRecognizing) { @@ -48,6 +51,9 @@ export const QuestionInput = ({ setMicrophoneIconActive(false); // Set microphone icon to inactive } }, [recognizedText, isRecognizing]); + useEffect(()=>{ + setIsMicrophoneDisabled(isTextToSpeachActive); + },[isTextToSpeachActive]) const sendQuestion = () => { if (disabled || (!question.trim() && !liveRecognizedText.trim())) { return; @@ -103,12 +109,14 @@ export const QuestionInput = ({ />
{/* Microphone Icon */} -
e.key === "Enter" || e.key === " " - ? isListening + ? (isListening) ? onStopClick() : onMicrophoneClick() : null @@ -117,20 +125,20 @@ export const QuestionInput = ({ tabIndex={0} aria-label="Microphone button" > - {microphoneIconActive ? ( + {microphoneIconActive || isMicrophoneDisabled ? ( ) : ( - Microphone )} -
+ {/* Send Button */} {isSendButtonDisabled?( ):( diff --git a/code/frontend/src/pages/chat/Chat.tsx b/code/frontend/src/pages/chat/Chat.tsx index 9f415caba..14b68a670 100644 --- a/code/frontend/src/pages/chat/Chat.tsx +++ b/code/frontend/src/pages/chat/Chat.tsx @@ -60,6 +60,8 @@ const Chat = () => { const [isListening, setIsListening] = useState(false); const recognizerRef = useRef(null); const [assistantType, setAssistantType] = useState(""); + const [activeCardIndex, setActiveCardIndex] = useState(null); + const [isTextToSpeachActive , setIsTextToSpeachActive] = useState(false); const makeApiRequest = async (question: string) => { lastQuestionRef.current = question; @@ -261,6 +263,13 @@ const Chat = () => { return []; }; + const handleSpeech = (index: number, status : string) => { + if(status != 'pause') + setActiveCardIndex(index); + setIsTextToSpeachActive(status =='speak' ? true : false) + }; + + return (
@@ -314,6 +323,8 @@ const Chat = () => { ? parseCitationFromMessage(answers[index - 1]) : [], }} + onSpeak={handleSpeech} + isActive={activeCardIndex === index} onCitationClicked={(c) => onShowCitation(c)} index={index} /> @@ -399,6 +410,7 @@ const Chat = () => { isListening={isListening} isRecognizing={isRecognizing} setRecognizedText={setRecognizedText} + isTextToSpeachActive = {isTextToSpeachActive} />
diff --git a/code/tests/functional/tests/backend_api/default/test_speech_token.py b/code/tests/functional/tests/backend_api/default/test_speech_token.py index 1388b0cd5..401f92191 100644 --- a/code/tests/functional/tests/backend_api/default/test_speech_token.py +++ b/code/tests/functional/tests/backend_api/default/test_speech_token.py @@ -20,6 +20,7 @@ def test_speech_token_returned(app_url: str, app_config: AppConfig): "token": "speech-token", "region": app_config.get("AZURE_SPEECH_SERVICE_REGION"), "languages": app_config.get("AZURE_SPEECH_RECOGNIZER_LANGUAGES").split(","), + "key": "some-azure-speech-service-key" } assert response.headers["Content-Type"] == "application/json" diff --git a/code/tests/test_app.py b/code/tests/test_app.py index b671dbb65..75deba2ce 100644 --- a/code/tests/test_app.py +++ b/code/tests/test_app.py @@ -110,6 +110,7 @@ def test_returns_speech_token_using_keys( "token": "speech-token", "region": AZURE_SPEECH_SERVICE_REGION, "languages": AZURE_SPEECH_RECOGNIZER_LANGUAGES, + "key": "mock-speech-key" } requests.post.assert_called_once_with( @@ -153,6 +154,7 @@ def test_returns_speech_token_using_rbac( "token": "speech-token", "region": AZURE_SPEECH_SERVICE_REGION, "languages": AZURE_SPEECH_RECOGNIZER_LANGUAGES, + "key": "mock-key1" } requests.post.assert_called_once_with(