Does live chat avatar synthesis support WordBoundary events?
Mindaugas Giedraitis
0
Reputation points
I am trying to set up the WordBoundary event callback for my live chat avatar synthesis, but the callback never runs (the avatar speaks in the front end, but I receive no events).
That brings me to the question - are these events even supported for live chat avatar?
If so, what am I doing wrong?
P.S. The voice I am using is "en-US-JennyMultilingualNeural". Here is my code that sets up the synthesizer and its connection objects:
def _set_up_speech_synthesizer(
    self,
    connection: AvatarConnection,
    session_description: RTCConnectionDescription,
) -> None:
    """Create and start the speech synthesizer for an avatar connection.

    Builds a ``SpeechSynthesizer`` against ``self.speech_wss_endpoint_tts``,
    registers a WordBoundary callback and connected/disconnected callbacks,
    attaches the avatar configuration to the ``speech.config`` message, and
    then synthesizes an empty string. On success the ``connectionString``
    found in the synthesizer's extra turn-start message is stored on
    ``connection.speech_synthesizer_remote_sdp``.

    Args:
        connection: Avatar connection whose ``speech_token`` and
            ``ice_token`` are read, and whose synthesizer-related
            attributes are populated by this method.
        session_description: Passed through to ``self._create_avatar_config``.

    Raises:
        Exception: If the synthesis result is ``None``, the synthesis was
            canceled with an error, the turn-start message is empty, or
            the turn-start message cannot be parsed.
    """
    # Example endpoint value:
    # "wss://westeurope.tts.speech.microsoft.com/cognitiveservices/websocket/v1?enableTalkingAvatar=true"
    tts_endpoint = f"{self.speech_wss_endpoint_tts}"
    speech_config = speechsdk.SpeechConfig(endpoint=tts_endpoint)
    speech_config.authorization_token = connection.speech_token

    # Required for WordBoundary event sentences.
    speech_config.set_property(
        property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
        value="true",
    )

    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    def _on_word_boundary(
        event: speechsdk.SpeechSynthesisWordBoundaryEventArgs,
    ):
        # Dump every field of the boundary event for debugging.
        audio_offset_ms = (event.audio_offset + 5000) / 10000
        print("WordBoundary event:")
        print(f"\tBoundaryType: {event.boundary_type}")
        print(f"\tAudioOffset: {audio_offset_ms}ms")
        print(f"\tDuration: {event.duration}")
        print(f"\tText: {event.text}")
        print(f"\tTextOffset: {event.text_offset}")
        print(f"\tWordLength: {event.word_length}")

    word_boundary_signal = synthesizer.synthesis_word_boundary
    word_boundary_signal.connect(_on_word_boundary)
    print(
        f"synthesis_word_boundary callback connected: {synthesizer.synthesis_word_boundary.is_connected()}"
    )

    connection.speech_synthesizer = synthesizer

    # The ICE token is a JSON document carrying the relay URL and credentials.
    ice_details = json.loads(connection.ice_token)
    avatar_config = self._create_avatar_config(
        session_description=session_description,
        url=ice_details["Urls"][0],
        username=ice_details["Username"],
        password=ice_details["Password"],
    )

    tts_connection = speechsdk.Connection.from_speech_synthesizer(
        connection.speech_synthesizer
    )

    def _on_connected(evt):
        print("TTS Avatar service connected.")

    def _on_disconnected(event: speechsdk.ConnectionEventArgs):
        # Clear the connection bookkeeping once the service drops the link.
        print("TTS Avatar service disconnected.")
        connection.speech_synthesizer_connection = None
        connection.speech_synthesizer_connected = False

    tts_connection.connected.connect(_on_connected)
    tts_connection.disconnected.connect(_on_disconnected)

    # The avatar configuration travels as the "context" of "speech.config".
    tts_connection.set_message_property(
        "speech.config", "context", json.dumps(avatar_config)
    )
    connection.speech_synthesizer_connection = tts_connection
    connection.speech_synthesizer_connected = True

    # NOTE(review): speaking an empty string presumably just opens the
    # avatar session so the turn-start message becomes available - confirm.
    synthesis_result = connection.speech_synthesizer.speak_text_async("").get()
    if synthesis_result is None:
        raise Exception(
            f"Speech synthesis result is None for connection {connection.connection_id}"
        )
    print(f"Result id for avatar connection: {synthesis_result.result_id}")

    if synthesis_result.reason == speechsdk.ResultReason.Canceled:
        cancellation = synthesis_result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation.reason}")
        if cancellation.reason == speechsdk.CancellationReason.Error:
            connection.status = AvatarConnectionStatus.FAILED
            print(f"Error details: {cancellation.error_details}")
            raise Exception(cancellation.error_details)

    synthesizer_properties = connection.speech_synthesizer.properties
    turn_start_message = synthesizer_properties.get_property_by_name(
        "SpeechSDKInternal-ExtraTurnStartMessage"
    )
    if not turn_start_message:
        raise Exception(
            f"Turn start message is empty for connection {connection.connection_id}"
        )

    try:
        turn_start_payload = json.loads(turn_start_message)
        connection.speech_synthesizer_remote_sdp = turn_start_payload["webrtc"][
            "connectionString"
        ]
    except Exception as parse_error:
        print(
            f"Error parsing turn start message: {turn_start_message}, {type(turn_start_message)}"
        )
        raise Exception(
            f"Error parsing turn start message: {parse_error}"
        ) from parse_error
Azure AI Speech
Azure AI Speech
An Azure service that integrates speech processing into apps and services.
Sign in to answer