Wednesday, January 29, 2025

Azure AI services - Recognize and synthesize speech

1. Create a 'Speech service' resource in Azure, and note its key and region (used in the code below).
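
If you prefer the command line to the portal, the resource can also be created with the Azure CLI. This is a sketch under assumptions: the resource and resource group names are placeholders, and the free F0 tier is assumed (use S0 if F0 is unavailable in your subscription):

az cognitiveservices account create --name <your-speech-resource> --resource-group <your-resource-group> --kind SpeechServices --sku F0 --location eastus --yes

az cognitiveservices account keys list --name <your-speech-resource> --resource-group <your-resource-group>

The second command retrieves the keys you can paste into the code (or the config files shown later).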

C# Code:

using System;
using System.Threading.Tasks;
using Microsoft.Extensions.Configuration;
using System.Media;

// Import namespaces
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

// dotnet add package Microsoft.CognitiveServices.Speech --version 1.30.0
// dotnet add package System.Windows.Extensions --version 4.6.0
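// If you restore the commented-out appsettings.json code below, the JSON configuration
// provider package is also needed (assumption based on the ConfigurationBuilder usage):
// dotnet add package Microsoft.Extensions.Configuration.Json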

namespace speaking_clock
{
    class Program
    {
        private static SpeechConfig speechConfig;
        static async Task Main(string[] args)
        {
            try
            {
                // Get config settings from AppSettings
                // IConfigurationBuilder builder = new ConfigurationBuilder().AddJsonFile("appsettings.json");
                // IConfigurationRoot configuration = builder.Build();
                string aiSvcKey = "<YOUR_SPEECH_KEY>"; // configuration["SpeechKey"]; -- avoid publishing real keys
                string aiSvcRegion = "eastus"; // configuration["SpeechRegion"];

                // Configure speech service
                speechConfig = SpeechConfig.FromSubscription(aiSvcKey, aiSvcRegion);
                Console.WriteLine("Ready to use speech service in " + speechConfig.Region);

                // Configure voice (note: TellTime overrides this before synthesizing)
                speechConfig.SpeechSynthesisVoiceName = "en-US-AriaNeural";

                // Get spoken input
                string command = "what time is it?"; // await TranscribeCommand();
                if (command.ToLower() == "what time is it?")
                {
                    await TellTime();
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

        static async Task<string> TranscribeCommand()
        {
            string command = "";

            // Configure speech recognition from the default microphone
            using AudioConfig audioConfig = AudioConfig.FromDefaultMicrophoneInput();
            using SpeechRecognizer speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);
            Console.WriteLine("Speak now...");

            // Alternative: recognize speech from an audio file (SoundPlayer requires System.Media)
            // string audioFile = "time.wav";
            // SoundPlayer wavPlayer = new SoundPlayer(audioFile);
            // wavPlayer.Play();
            // using AudioConfig audioConfig = AudioConfig.FromWavFileInput(audioFile);
            // using SpeechRecognizer speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);

            // Process speech input
            SpeechRecognitionResult speech = await speechRecognizer.RecognizeOnceAsync();
            if (speech.Reason == ResultReason.RecognizedSpeech)
            {
                command = speech.Text;
                Console.WriteLine(command);
            }
            else
            {
                Console.WriteLine(speech.Reason);
                if (speech.Reason == ResultReason.Canceled)
                {
                    var cancellation = CancellationDetails.FromResult(speech);
                    Console.WriteLine(cancellation.Reason);
                    Console.WriteLine(cancellation.ErrorDetails);
                }
            }

            // Return the command
            return command;
        }

        static async Task TellTime()
        {
            var now = DateTime.Now;
            string responseText = "The time is " + now.Hour.ToString() + ":" + now.Minute.ToString("D2");

            // Configure speech synthesis (the voice element in the SSML below overrides this voice name)
            speechConfig.SpeechSynthesisVoiceName = "en-GB-RyanNeural";
            using SpeechSynthesizer speechSynthesizer = new SpeechSynthesizer(speechConfig);

            // Synthesize spoken output
            string responseSsml = $@"
                <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
                    <voice name='en-GB-LibbyNeural'>
                        {responseText}
                        <break strength='weak'/>
                        Time to end this lab!
                    </voice>
                </speak>";
            SpeechSynthesisResult speak = await speechSynthesizer.SpeakSsmlAsync(responseSsml);
            if (speak.Reason != ResultReason.SynthesizingAudioCompleted)
            {
                Console.WriteLine(speak.Reason);
            }

            // Print the response
            Console.WriteLine(responseText);
        }
    }
}
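
The commented-out ConfigurationBuilder lines above expect an appsettings.json file in the project folder (set to copy to the output directory so AddJsonFile can find it at run time). A minimal sketch, with a placeholder key and the setting names the code looks up ("SpeechKey" and "SpeechRegion"):

{
    "SpeechKey": "<YOUR_SPEECH_KEY>",
    "SpeechRegion": "eastus"
}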

Output:
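
Illustrative console output for the hard-coded command path (the time shown is just an example):

Ready to use speech service in eastus
The time is 14:35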



Python Code:

from dotenv import load_dotenv  
from datetime import datetime  
from playsound import playsound
import os  

# Import namespaces  
import azure.cognitiveservices.speech as speech_sdk  

# pip install azure-cognitiveservices-speech==1.30.0
# pip install python-dotenv
# pip install playsound==1.2.2

def main():  
    try:  
        global speech_config  

        # Get Configuration Settings  
        load_dotenv()  
        ai_key = '<YOUR_SPEECH_KEY>'  # os.getenv('SPEECH_KEY') -- avoid publishing real keys
        ai_region = 'eastus'  # os.getenv('SPEECH_REGION')

        # Configure speech service  
        speech_config = speech_sdk.SpeechConfig(subscription=ai_key, region=ai_region)  
        print('Ready to use speech service in:', speech_config.region)  

        # Get spoken input  
        command = 'what time is it?'  # TranscribeCommand()
        if command.lower() == 'what time is it?':  
            TellTime()  

    except Exception as ex:  
        print(ex)  

def TranscribeCommand():  
    command = ''  

    # Configure speech recognition from an audio file
    current_dir = os.getcwd()
    audioFile = os.path.join(current_dir, 'time.wav')
    playsound(audioFile)
    audio_config = speech_sdk.AudioConfig(filename=audioFile)
    speech_recognizer = speech_sdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # Alternative: recognize speech from the default microphone
    # audio_config = speech_sdk.AudioConfig(use_default_microphone=True)
    # speech_recognizer = speech_sdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    # print('Speak now...')

    # Process speech input  
    speech = speech_recognizer.recognize_once_async().get()  

    if speech.reason == speech_sdk.ResultReason.RecognizedSpeech:  
        command = speech.text  
        print(command)  
    else:  
        print(speech.reason)  
        if speech.reason == speech_sdk.ResultReason.Canceled:  
            cancellation = speech.cancellation_details  
            print(cancellation.reason)  
            print(cancellation.error_details)  

    # Return the command  
    return command  

def TellTime():  
    now = datetime.now()  
    response_text = 'The time is {}:{:02d}'.format(now.hour, now.minute)  

    # Configure speech synthesis (the voice element in the SSML below overrides this voice name)
    speech_config.speech_synthesis_voice_name = "en-GB-RyanNeural"
    speech_synthesizer = speech_sdk.SpeechSynthesizer(speech_config=speech_config)

    # Alternative: synthesize plain text with the voice configured above
    # speak = speech_synthesizer.speak_text_async(response_text).get()
    # if speak.reason != speech_sdk.ResultReason.SynthesizingAudioCompleted:
    #     print(speak.reason)

    # Synthesize spoken output from SSML
    responseSsml = " \
        <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'> \
            <voice name='en-GB-LibbyNeural'> \
                {} \
                <break strength='weak'/> \
                Time to end this lab! \
            </voice> \
        </speak>".format(response_text)
    speak = speech_synthesizer.speak_ssml_async(responseSsml).get()

    if speak.reason != speech_sdk.ResultReason.SynthesizingAudioCompleted:
        print(speak.reason)

    # Print the response  
    print(response_text)  

if __name__ == "__main__":  
    main()
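
The load_dotenv() call expects a .env file in the working directory. A minimal sketch matching the os.getenv('SPEECH_KEY') and os.getenv('SPEECH_REGION') lookups above (the key is a placeholder):

SPEECH_KEY=<YOUR_SPEECH_KEY>
SPEECH_REGION=eastus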

Output:
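
Illustrative console output (the time shown is just an example; the time.wav clip also plays first):

Ready to use speech service in: eastus
The time is 14:35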




