Does live chat avatar synthesis support WordBoundary events?

Mindaugas Giedraitis 0 Reputation points
2025-08-07T13:03:07.6866667+00:00

I was trying to set up the WordBoundary event callback for my live chat avatar synthesis, but the callback never runs (the avatar speaks in the front end, but I receive no events).

That brings me to the question - are these events even supported for live chat avatar?

If so, what am I doing wrong?

P.S. The voice I am using is "en-US-JennyMultilingualNeural", and here is the code I use to set up the synthesizer and its connection objects:

def _set_up_speech_synthesizer(
    self,
    connection: AvatarConnection,
    session_description: RTCConnectionDescription,
) -> None:
    """Create the avatar speech synthesizer for ``connection`` and start it.

    Builds a ``SpeechSynthesizer`` against ``self.speech_wss_endpoint_tts``,
    subscribes a WordBoundary logging callback, attaches the avatar
    configuration as the ``speech.config`` context message, speaks an empty
    string, and finally stores the remote SDP taken from the turn-start
    message on ``connection``.

    Args:
        connection: Per-session state. Reads ``speech_token``, ``ice_token``
            and ``connection_id``; writes ``speech_synthesizer``,
            ``speech_synthesizer_connection``,
            ``speech_synthesizer_connected``,
            ``speech_synthesizer_remote_sdp`` and, when synthesis is
            cancelled with an error, ``status``.
        session_description: Forwarded unchanged to
            ``self._create_avatar_config``.

    Raises:
        Exception: If the synthesis result is ``None``, if synthesis was
            cancelled with an error, or if the turn-start message is empty
            or cannot be parsed.
    """
    speech_config = speechsdk.SpeechConfig(
        # Example value of self.speech_wss_endpoint_tts:
        # "wss://westeurope.tts.speech.microsoft.com/cognitiveservices/websocket/v1?enableTalkingAvatar=true"
        endpoint=f"{self.speech_wss_endpoint_tts}"
    )

    speech_config.authorization_token = connection.speech_token
    # Explicitly request word-boundary events from the service.
    # NOTE(review): nothing in this method confirms that the talking-avatar
    # endpoint actually emits them — verify against the service docs.
    speech_config.set_property(
        property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
        value="true",
    )

    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
    )

    def speech_synthesizer_word_boundary_callback(
        event: speechsdk.SpeechSynthesisWordBoundaryEventArgs,
    ):
        """Log the details of a single WordBoundary event."""
        print("WordBoundary event:")
        print("\tBoundaryType: {}".format(event.boundary_type))
        # audio_offset is reported in 100 ns ticks; adding 5000 and using
        # integer division rounds to the nearest millisecond. True division
        # here would print a float that is 0.5 ms too large.
        print("\tAudioOffset: {}ms".format((event.audio_offset + 5000) // 10000))
        print("\tDuration: {}".format(event.duration))
        print("\tText: {}".format(event.text))
        print("\tTextOffset: {}".format(event.text_offset))
        print("\tWordLength: {}".format(event.word_length))

    speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_callback)
    print(f"synthesis_word_boundary callback connected: {speech_synthesizer.synthesis_word_boundary.is_connected()}")

    connection.speech_synthesizer = speech_synthesizer
    # The ICE token is a JSON document; its first URL and its credentials
    # are passed into the avatar configuration.
    ice_token_obj = json.loads(connection.ice_token)
    avatar_config = self._create_avatar_config(
        session_description=session_description,
        url=ice_token_obj["Urls"][0],
        username=ice_token_obj["Username"],
        password=ice_token_obj["Password"],
    )
    speech_synthesizer_connection = speechsdk.Connection.from_speech_synthesizer(
        connection.speech_synthesizer
    )
    speech_synthesizer_connection.connected.connect(
        lambda evt: print("TTS Avatar service connected.")
    )

    def tts_disconnected_callback(event: speechsdk.ConnectionEventArgs):
        """Clear the stored connection state when the service disconnects."""
        print("TTS Avatar service disconnected.")
        connection.speech_synthesizer_connection = None
        connection.speech_synthesizer_connected = False

    speech_synthesizer_connection.disconnected.connect(tts_disconnected_callback)
    # The avatar configuration is attached as the "context" property of the
    # "speech.config" message before the first synthesis request below.
    speech_synthesizer_connection.set_message_property(
        "speech.config", "context", json.dumps(avatar_config)
    )
    connection.speech_synthesizer_connection = speech_synthesizer_connection
    # NOTE(review): the flag is set before the first request is made, not
    # from the `connected` event — confirm this is what callers expect.
    connection.speech_synthesizer_connected = True

    # Speak an empty string; presumably this opens the avatar session so the
    # turn-start message read further down becomes available — confirm.
    synthesis_result = connection.speech_synthesizer.speak_text_async(
        ""
    ).get()
    if synthesis_result is None:
        raise Exception(
            f"Speech synthesis result is None for connection {connection.connection_id}"
        )
    print(f"Result id for avatar connection: {synthesis_result.result_id}")
    if synthesis_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = synthesis_result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        # Only an error cancellation aborts the setup; any other
        # cancellation reason falls through to the turn-start handling.
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            connection.status = AvatarConnectionStatus.FAILED
            print(f"Error details: {cancellation_details.error_details}")
            raise Exception(cancellation_details.error_details)
    turn_start_message = (
        connection.speech_synthesizer.properties.get_property_by_name(
            "SpeechSDKInternal-ExtraTurnStartMessage"
        )
    )
    if not turn_start_message:
        raise Exception(
            f"Turn start message is empty for connection {connection.connection_id}"
        )
    try:
        # The turn-start message is JSON; the remote SDP is read from
        # webrtc.connectionString.
        connection.speech_synthesizer_remote_sdp = json.loads(turn_start_message)[
            "webrtc"
        ]["connectionString"]
    except Exception as e:
        print(
            f"Error parsing turn start message: {turn_start_message}, {type(turn_start_message)}"
        )
        raise Exception(f"Error parsing turn start message: {e}") from e

Azure AI Speech
Azure AI Speech
An Azure service that integrates speech processing into apps and services.
{count} votes

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.