From dd88ebc5b41a94a43a3630d44283eba5342e2c68 Mon Sep 17 00:00:00 2001 From: Roopan-Microsoft <168007406+Roopan-Microsoft@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:23:21 +0530 Subject: [PATCH] feat: Text to Speech feature implementation (#1235) Co-authored-by: Rohini-Microsoft Co-authored-by: Bangarraju-Microsoft Co-authored-by: Harmanpreet Kaur --- code/create_app.py | 1 + code/frontend/src/assets/pauseIcon.svg | 3 + code/frontend/src/assets/speakerIcon.svg | 3 + .../frontend/src/components/Answer/Answer.tsx | 383 +++++++++++++----- .../QuestionInput/QuestionInput.module.css | 6 +- .../QuestionInput/QuestionInput.tsx | 22 +- code/frontend/src/pages/chat/Chat.tsx | 12 + .../backend_api/default/test_speech_token.py | 1 + code/tests/test_app.py | 2 + 9 files changed, 326 insertions(+), 107 deletions(-) create mode 100644 code/frontend/src/assets/pauseIcon.svg create mode 100644 code/frontend/src/assets/speakerIcon.svg diff --git a/code/create_app.py b/code/create_app.py index c8eba08b4..a01d37832 100644 --- a/code/create_app.py +++ b/code/create_app.py @@ -437,6 +437,7 @@ def speech_config(): if response.status_code == 200: return { "token": response.text, + "key": speech_key, "region": env_helper.AZURE_SPEECH_SERVICE_REGION, "languages": env_helper.AZURE_SPEECH_RECOGNIZER_LANGUAGES, } diff --git a/code/frontend/src/assets/pauseIcon.svg b/code/frontend/src/assets/pauseIcon.svg new file mode 100644 index 000000000..427df46b2 --- /dev/null +++ b/code/frontend/src/assets/pauseIcon.svg @@ -0,0 +1,3 @@ + + + diff --git a/code/frontend/src/assets/speakerIcon.svg b/code/frontend/src/assets/speakerIcon.svg new file mode 100644 index 000000000..23c9286ad --- /dev/null +++ b/code/frontend/src/assets/speakerIcon.svg @@ -0,0 +1,3 @@ + + + diff --git a/code/frontend/src/components/Answer/Answer.tsx b/code/frontend/src/components/Answer/Answer.tsx index 1f689a20e..3d78831f9 100644 --- a/code/frontend/src/components/Answer/Answer.tsx +++ b/code/frontend/src/components/Answer/Answer.tsx @@ -1,123 +1,310 @@ -import { useEffect, useMemo, useState, useRef } from "react"; +import { useEffect, useMemo, useState, useRef, forwardRef } from "react"; import { useBoolean } from "@fluentui/react-hooks" import { FontIcon, Stack, Text } from "@fluentui/react"; - import styles from "./Answer.module.css"; - import { AskResponse, Citation } from "../../api"; import { parseAnswer } from "./AnswerParser"; - import ReactMarkdown from "react-markdown"; import remarkGfm from "remark-gfm"; import supersub from 'remark-supersub' +import pauseIcon from "../../assets/pauseIcon.svg"; +import speakerIcon from "../../assets/speakerIcon.svg"; +import * as sdk from 'microsoft-cognitiveservices-speech-sdk'; +import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk'; + +declare global { + interface Window { + webkitAudioContext: typeof AudioContext; + } +} interface Props { - answer: AskResponse; - onCitationClicked: (citedDocument: Citation) => void; - index: number; + answer: AskResponse; + onCitationClicked: (citedDocument: Citation) => void; + onSpeak?: any; + isActive?: boolean; + index: number; } +const MyStackComponent = forwardRef((props, ref) => ( +
+)); export const Answer = ({ - answer, - onCitationClicked, - index, + answer, + onCitationClicked, + onSpeak, + isActive, + index, }: Props) => { - const [isRefAccordionOpen, { toggle: toggleIsRefAccordionOpen }] = useBoolean(false); - const filePathTruncationLimit = 50; + const [isRefAccordionOpen, { toggle: toggleIsRefAccordionOpen }] = useBoolean(false); + const filePathTruncationLimit = 50; + const answerContainerRef = useRef(null);// read the text from the container + const messageBoxId = "message-" + index; + const [isSpeaking, setIsSpeaking] = useState(false); // for speaker on + const [showSpeaker, setShowSpeaker] = useState(true); //for show and hide the speaker icon + const [isPaused, setIsPaused] = useState(false); //for pause + const parsedAnswer = useMemo(() => parseAnswer(answer), [answer]); + const [chevronIsExpanded, setChevronIsExpanded] = useState(isRefAccordionOpen); + const refContainer = useRef(null); + const [audioContext, setAudioContext] = useState(null); //Manully manage the audio context eg pausing resuming - const messageBoxId = "message-" + index; - const parsedAnswer = useMemo(() => parseAnswer(answer), [answer]); - const [chevronIsExpanded, setChevronIsExpanded] = useState(isRefAccordionOpen); - const refContainer = useRef(null); - const handleChevronClick = () => { - setChevronIsExpanded(!chevronIsExpanded); - toggleIsRefAccordionOpen(); - }; + const [synthesizerData, setSynthesizerData] = useState({ key: '', region: '' }); + const [synthesizer, setSynthesizer] = useState(null); + const [audioDestination, setAudioDestination] = useState(null); + const [playbackTimeout, setPlaybackTimeout] = useState(null); + const [remainingDuration, setRemainingDuration] = useState(0); + const [startTime, setStartTime] = useState(null); + + const handleChevronClick = () => { + setChevronIsExpanded(!chevronIsExpanded); + toggleIsRefAccordionOpen(); + }; + + const initializeSynthesizer = () =>{ + const speechConfig = sdk.SpeechConfig.fromSubscription(synthesizerData.key, synthesizerData.region); + const newAudioDestination = new SpeechSDK.SpeakerAudioDestination(); + const audioConfig = SpeechSDK.AudioConfig.fromSpeakerOutput(newAudioDestination); + const newSynthesizer = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig); + setSynthesizer(newSynthesizer); + setAudioDestination(newAudioDestination); + if (playbackTimeout) { + clearTimeout(playbackTimeout); + } + setRemainingDuration(0); + } + + useEffect(() => { + if (synthesizerData.key != '') { + initializeSynthesizer(); - useEffect(() => { - setChevronIsExpanded(isRefAccordionOpen); - if(chevronIsExpanded && refContainer.current){ - refContainer.current.scrollIntoView({ behavior:'smooth'}); + return () => { + if (synthesizer) { + synthesizer.close(); } - }, [chevronIsExpanded,isRefAccordionOpen]); - - const createCitationFilepath = (citation: Citation, index: number, truncate: boolean = false) => { - let citationFilename = ""; - - if (citation.filepath && citation.chunk_id != null) { - if (truncate && citation.filepath.length > filePathTruncationLimit) { - const citationLength = citation.filepath.length; - citationFilename = `${citation.filepath.substring(0, 20)}...${citation.filepath.substring(citationLength -20)} - Part ${citation.chunk_id}`; - } - else { - citationFilename = `${citation.filepath} - Part ${citation.chunk_id}`; - } + if (audioDestination) { + audioDestination.close(); } - else { - citationFilename = `Citation ${index}`; + if (playbackTimeout) { + clearTimeout(playbackTimeout); } - return 
citationFilename; + }; } - useEffect(() => { - const handleCopy = () => { - alert("Please consider where you paste this content."); - }; - const messageBox = document.getElementById(messageBoxId); - messageBox?.addEventListener("copy", handleCopy); - return () => { - messageBox?.removeEventListener("copy", handleCopy); - }; - }, []); - - return ( - <> - - - - - - - AI-generated content may be incorrect - - - {!!parsedAnswer.citations.length && ( - - - - - {parsedAnswer.citations.length > 1 ? parsedAnswer.citations.length + " references" : "1 reference"} - - - - - - - )} + }, [index, synthesizerData]); - - {chevronIsExpanded && -
- {parsedAnswer.citations.map((citation, idx) => { - return ( - onCitationClicked(citation)} className={styles.citationContainer}> -
{idx}
- {createCitationFilepath(citation, idx, true)} -
); - })} -
- } -
- + useEffect(() => { + const fetchSythesizerData = async () => { + const response = await fetch('/api/speech'); + if (!response.ok) { + throw new Error('Network response was not ok'); + } + const data = await response.json(); + setSynthesizerData({ key: data.key, region: data.region }); + } + fetchSythesizerData(); + }, []) + + useEffect(() => { + if (!isActive && synthesizer && isSpeaking) { + resetSpeech() + } + }, [isActive, synthesizer]); + + useEffect(() => { + setChevronIsExpanded(isRefAccordionOpen); + if (chevronIsExpanded && refContainer.current) { + refContainer.current.scrollIntoView({ behavior: 'smooth' }); + } + // After genrating answer then only show speaker icon + if (parsedAnswer.markdownFormatText === "Generating answer...") { + setShowSpeaker(false); + } else { + setShowSpeaker(true); + } + }, [chevronIsExpanded, isRefAccordionOpen, parsedAnswer]); + + const createCitationFilepath = (citation: Citation, index: number, truncate: boolean = false) => { + let citationFilename = ""; + + if (citation.filepath && citation.chunk_id != null) { + if (truncate && citation.filepath.length > filePathTruncationLimit) { + const citationLength = citation.filepath.length; + citationFilename = `${citation.filepath.substring(0, 20)}...${citation.filepath.substring(citationLength - 20)} - Part ${citation.chunk_id}`; + } + else { + citationFilename = `${citation.filepath} - Part ${citation.chunk_id}`; + } + } + else { + citationFilename = `Citation ${index}`; + } + return citationFilename; + } + + const getAnswerText = () => { + if (answerContainerRef.current) { + const text = answerContainerRef.current.textContent ?? ''; + return text; + } + return ''; + }; + + const startSpeech = () => { + if (synthesizer) { + const text = getAnswerText(); + synthesizer?.speakTextAsync( + text, + result => { + if (result.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) { + const duration = result.audioDuration / 10000; + setRemainingDuration(duration); + setStartTime(Date.now()); + handleTimeout(duration); + } else if (result.reason === SpeechSDK.ResultReason.Canceled) { + setIsSpeaking(false); + setIsPaused(false); + } else { + console.error('Synthesis failed: ', result.errorDetails); + } + }, + error => { + console.error('Synthesis error: ', error); + setIsSpeaking(false); + setIsPaused(false); + } + ); + setIsSpeaking(true); + } + }; + + const handleTimeout = (remainingDuration: number) => { + setPlaybackTimeout( + setTimeout(() => { + setIsSpeaking(false); + setIsPaused(false); + onSpeak(index , 'stop'); + }, remainingDuration) ); + }; + + const resetSpeech = () => { + //audioDestination?.close(); + audioDestination?.pause(); + setIsSpeaking(false); + setIsPaused(false); + //synthesizer?.close(); + initializeSynthesizer(); + } + const handleSpeakPauseResume = () => { + if (isSpeaking) { + if (isPaused) { + onSpeak(index , 'speak'); + audioDestination?.resume(); + setIsPaused(false); + setStartTime(Date.now()); + handleTimeout(remainingDuration); + } else { + onSpeak(index , 'pause'); + audioDestination?.pause(); + setIsPaused(true); + const elapsed = Date.now() - (startTime || 0); + const newRemainingDuration = remainingDuration - elapsed; + setRemainingDuration(newRemainingDuration); + if (playbackTimeout) { + clearTimeout(playbackTimeout); + } + } + } else { + onSpeak(index , 'speak'); + startSpeech(); + } + }; + + useEffect(() => { + const handleCopy = () => { + alert("Please consider where you paste this content."); + }; + const messageBox = document.getElementById(messageBoxId); + 
messageBox?.addEventListener("copy", handleCopy); + return () => { + messageBox?.removeEventListener("copy", handleCopy); + }; + }, []); + + const getSpeechButtons = () => { + const speechStatus = !showSpeaker ? "none" : showSpeaker && !isSpeaking ? "Speak" + : isSpeaking && isPaused ? "Resume" : "Pause"; + + switch (speechStatus) { + case 'Speak': + case 'Resume': + return ( + + ) + case 'Pause': + return ( + + ) + default: + return null; + } + } + + return ( + <> + + + + + + + AI-generated content may be incorrect + + + {!!parsedAnswer.citations.length && ( + + + + + {parsedAnswer.citations.length > 1 ? parsedAnswer.citations.length + " references" : "1 reference"} + + + + + + + )} + + + {chevronIsExpanded && +
+ {parsedAnswer.citations.map((citation, idx) => { + return ( + onCitationClicked(citation)} className={styles.citationContainer}> +
{idx}
+ {createCitationFilepath(citation, idx, true)} +
); + })} +
+ } + + {getSpeechButtons()} + +
+ + ); }; diff --git a/code/frontend/src/components/QuestionInput/QuestionInput.module.css b/code/frontend/src/components/QuestionInput/QuestionInput.module.css index 5788340fd..146d075c1 100644 --- a/code/frontend/src/components/QuestionInput/QuestionInput.module.css +++ b/code/frontend/src/components/QuestionInput/QuestionInput.module.css @@ -27,7 +27,7 @@ right: 24px; bottom: 20px; position: static; - margin-right: 12px; + /* margin-right: 12px; */ } .questionInputSendButton { @@ -39,7 +39,7 @@ pointer-events: none; width: 24px; height: 23px; - margin-right: 12px; + /* margin-right: 12px; */ } .questionInputSendButtonDisabled { @@ -75,6 +75,8 @@ width: 24px; height: 24px; margin-bottom: 12px; + border: none; + background: transparent; } .microphoneAndSendContainer { diff --git a/code/frontend/src/components/QuestionInput/QuestionInput.tsx b/code/frontend/src/components/QuestionInput/QuestionInput.tsx index 0d3bd8e8d..6a8cbef6f 100644 --- a/code/frontend/src/components/QuestionInput/QuestionInput.tsx +++ b/code/frontend/src/components/QuestionInput/QuestionInput.tsx @@ -17,6 +17,7 @@ interface Props { recognizedText: string; isListening: boolean; isRecognizing: boolean; + isTextToSpeachActive : boolean; setRecognizedText: (text: string) => void; } @@ -32,11 +33,13 @@ export const QuestionInput = ({ isListening, isRecognizing, setRecognizedText, + isTextToSpeachActive }: Props) => { const [question, setQuestion] = useState(""); const [liveRecognizedText, setLiveRecognizedText] = useState(""); const [microphoneIconActive, setMicrophoneIconActive] = useState(false); + const [isMicrophoneDisabled , setIsMicrophoneDisabled] = useState(false); const [isTextAreaDisabled, setIsTextAreaDisabled] = useState(false); useEffect(() => { if (isRecognizing) { @@ -48,6 +51,9 @@ export const QuestionInput = ({ setMicrophoneIconActive(false); // Set microphone icon to inactive } }, [recognizedText, isRecognizing]); + useEffect(()=>{ + setIsMicrophoneDisabled(isTextToSpeachActive); + },[isTextToSpeachActive]) const sendQuestion = () => { if (disabled || (!question.trim() && !liveRecognizedText.trim())) { return; @@ -103,12 +109,14 @@ export const QuestionInput = ({ />
{/* Microphone Icon */} -
e.key === "Enter" || e.key === " " - ? isListening + ? (isListening) ? onStopClick() : onMicrophoneClick() : null @@ -117,20 +125,20 @@ export const QuestionInput = ({ tabIndex={0} aria-label="Microphone button" > - {microphoneIconActive ? ( + {microphoneIconActive || isMicrophoneDisabled ? ( ) : ( - Microphone )} -
+ {/* Send Button */} {isSendButtonDisabled?( ):( diff --git a/code/frontend/src/pages/chat/Chat.tsx b/code/frontend/src/pages/chat/Chat.tsx index 9f415caba..14b68a670 100644 --- a/code/frontend/src/pages/chat/Chat.tsx +++ b/code/frontend/src/pages/chat/Chat.tsx @@ -60,6 +60,8 @@ const Chat = () => { const [isListening, setIsListening] = useState(false); const recognizerRef = useRef(null); const [assistantType, setAssistantType] = useState(""); + const [activeCardIndex, setActiveCardIndex] = useState(null); + const [isTextToSpeachActive , setIsTextToSpeachActive] = useState(false); const makeApiRequest = async (question: string) => { lastQuestionRef.current = question; @@ -261,6 +263,13 @@ const Chat = () => { return []; }; + const handleSpeech = (index: number, status : string) => { + if(status != 'pause') + setActiveCardIndex(index); + setIsTextToSpeachActive(status =='speak' ? true : false) + }; + + return (
@@ -314,6 +323,8 @@ const Chat = () => { ? parseCitationFromMessage(answers[index - 1]) : [], }} + onSpeak={handleSpeech} + isActive={activeCardIndex === index} onCitationClicked={(c) => onShowCitation(c)} index={index} /> @@ -399,6 +410,7 @@ const Chat = () => { isListening={isListening} isRecognizing={isRecognizing} setRecognizedText={setRecognizedText} + isTextToSpeachActive = {isTextToSpeachActive} />
diff --git a/code/tests/functional/tests/backend_api/default/test_speech_token.py b/code/tests/functional/tests/backend_api/default/test_speech_token.py index 1388b0cd5..401f92191 100644 --- a/code/tests/functional/tests/backend_api/default/test_speech_token.py +++ b/code/tests/functional/tests/backend_api/default/test_speech_token.py @@ -20,6 +20,7 @@ def test_speech_token_returned(app_url: str, app_config: AppConfig): "token": "speech-token", "region": app_config.get("AZURE_SPEECH_SERVICE_REGION"), "languages": app_config.get("AZURE_SPEECH_RECOGNIZER_LANGUAGES").split(","), + "key": "some-azure-speech-service-key" } assert response.headers["Content-Type"] == "application/json" diff --git a/code/tests/test_app.py b/code/tests/test_app.py index b671dbb65..75deba2ce 100644 --- a/code/tests/test_app.py +++ b/code/tests/test_app.py @@ -110,6 +110,7 @@ def test_returns_speech_token_using_keys( "token": "speech-token", "region": AZURE_SPEECH_SERVICE_REGION, "languages": AZURE_SPEECH_RECOGNIZER_LANGUAGES, + "key": "mock-speech-key" } requests.post.assert_called_once_with( @@ -153,6 +154,7 @@ def test_returns_speech_token_using_rbac( "token": "speech-token", "region": AZURE_SPEECH_SERVICE_REGION, "languages": AZURE_SPEECH_RECOGNIZER_LANGUAGES, + "key": "mock-key1" } requests.post.assert_called_once_with(