Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/ctrlm_ipc.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,8 @@ typedef enum {
CTRLM_RCU_IARM_EVENT_RF4CE_PAIRING_WINDOW_TIMEOUT = 35, ///< Indicates that a battery milestone event occurred
CTRLM_RCU_IARM_EVENT_FIRMWARE_UPDATE_PROGRESS = 36, ///< Generated when a milestone is reached for remote firmware upgrade
CTRLM_RCU_IARM_EVENT_VALIDATION_STATUS = 37, ///< Generated when the validation status changes
CTRLM_MAIN_IARM_EVENT_MAX = 38 ///< Placeholder for the last event (used in registration)
CTRLM_VOICE_IARM_EVENT_SESSION_SILENT = 38, ///< Voice session was silent (no speech detected)
CTRLM_MAIN_IARM_EVENT_MAX = 39 ///< Placeholder for the last event (used in registration)
} ctrlm_main_iarm_event_t;

/// @brief Remote Control Key Status
Expand Down
11 changes: 11 additions & 0 deletions include/ctrlm_ipc_voice.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,16 @@ typedef struct {
long return_code_internal; ///< Internally generated return code
} ctrlm_voice_iarm_event_session_short_t;

/// @brief Payload of the CTRLM_VOICE_IARM_EVENT_SESSION_SILENT event, which is generated when a silent voice session is detected.
typedef struct {
unsigned char api_revision; ///< The revision of this API.
ctrlm_network_id_t network_id; ///< Identifier of network on which the controller is bound
ctrlm_network_type_t network_type; ///< Type of network on which the controller is bound
ctrlm_controller_id_t controller_id; ///< A unique identifier of the remote
unsigned long session_id; ///< A unique id for the voice session.
ctrlm_voice_session_end_reason_t reason; ///< The reason that the voice session was silent
Comment thread
dwolaver marked this conversation as resolved.
long return_code_internal; ///< Internally generated return code
} ctrlm_voice_iarm_event_session_silent_t;

typedef struct {
unsigned char api_revision; ///< The revision of this API
char media_service_url[2083]; ///< The url for the media service (null terminated string)
Expand Down Expand Up @@ -376,6 +386,7 @@ typedef struct {
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_STATS | ctrlm_voice_iarm_event_session_stats_t * | Generated when the statistics of the voice session are available |
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_ABORT | ctrlm_voice_iarm_event_session_abort_t * | Generated when a voice session is aborted (denied) |
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_SHORT | ctrlm_voice_iarm_event_session_short_t * | Generated when a short voice session is detected |
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_SILENT | ctrlm_voice_iarm_event_session_silent_t * | Generated when a silent voice session is detected |
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_MEDIA_SERVICE | ctrlm_voice_iarm_event_media_service_t * | Generated when a media service response is received |
///
/// IARM events are available on a subscription basis. In order to receive an event, a client must explicitly register to receive the event by calling
Expand Down
4 changes: 3 additions & 1 deletion src/ctrlm_config_default.json
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,9 @@
"par_voice_eos_method" : 1,
"par_voice_eos_timeout" : 2500,
"server_hosts" : [],
"telemetry_session_stats" : false
"telemetry_session_stats" : false,
"voice_activity_detection_mode" : "enabled"

},
"device_update" : {
"dir_root" : "/srv/device_update/",
Expand Down
16 changes: 11 additions & 5 deletions src/telemetry/ctrlm_telemetry_markers.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
// The Voice Session Statistics Marker reports statistics for the voice session(s). The format of the marker is a json array of arrays with each event in the format below:
//
// [[event1], [event2], [event3], ...]
// [<version>,<device_type>,<device_version>,<encoding>,<interaction_mode>,<time_prev_session>,<time_start_lag>,<time_stream_len_exp>,<time_stream_len_act>,<time_stream_delta>,<packets_total>,<packets_lost>,<samples_total>,<samples_lost>,<decoder_failures>,<samples_buffered_max>,<stream_ret_code>,<protocol_ret_code>,<server_ret_code>,<server_message>,<result>]
// [<version>,<device_type>,<device_version>,<encoding>,<interaction_mode>,<time_prev_session>,<time_start_lag>,<time_stream_len_exp>,<time_stream_len_act>,<time_stream_delta>,<packets_total>,<packets_lost>,<samples_total>,<samples_lost>,<decoder_failures>,<samples_buffered_max>,<end_reason_rcu>,<end_reason_session>,<end_reason_server>,<server_message>,<result>,<end_reason_stream>,<ret_code_protocol>,<voice_detected>,<peak_confidence>,<peak_rms_level>]
//
// <version> - Version of the marker format.
// <device_type> - Name of the device that started the session.
Expand All @@ -111,13 +111,19 @@
// <samples_lost> - lost samples.
// <decoder_failures> - decoder failure count.
// <samples_buffered_max> - sample buffer high watermark.
// <stream_ret_code> - audio stream success/error code.
// <protocol_ret_code> - protocol success/error code.
// <server_ret_code> - server success/error code.
// <end_reason_rcu> - audio stream success/error code.
// <end_reason_session> - session success/error code.
// <end_reason_server> - server success/error code.
// <server_message> - server message.
// <result> - flag to indicate if session was successful.
// <end_reason_stream> - reason why the stream ended (if available).
// <ret_code_protocol> - protocol return code.
// <voice_detected> - flag to indicate if voice was detected.
Comment thread
dwolaver marked this conversation as resolved.
Outdated
// <peak_confidence> - peak confidence level.
// <peak_rms_level> - peak RMS level.

#define MARKER_VOICE_SESSION_STATS "ctrlm.voice.session.stats"
#define MARKER_VOICE_SESSION_STATS_VERSION "2"
#define MARKER_VOICE_SESSION_STATS_VERSION "3"

// End Voice Session Statistics

Expand Down
111 changes: 76 additions & 35 deletions src/voice/ctrlm_voice_obj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,39 +186,40 @@ ctrlm_voice_t::ctrlm_voice_t() {
#ifdef JSON_ARRAY_VAL_STR_VOICE_SERVER_HOSTS_3
this->url_hostname_pattern_add(JSON_ARRAY_VAL_STR_VOICE_SERVER_HOSTS_3);
#endif
this->prefs.aspect_ratio = JSON_STR_VALUE_VOICE_ASPECT_RATIO;
this->prefs.guide_language = JSON_STR_VALUE_VOICE_LANGUAGE;
this->prefs.app_id_http = JSON_STR_VALUE_VOICE_APP_ID_HTTP;
this->prefs.app_id_ws = JSON_STR_VALUE_VOICE_APP_ID_WS;
this->prefs.timeout_vrex_connect = JSON_INT_VALUE_VOICE_VREX_REQUEST_TIMEOUT;
this->prefs.timeout_vrex_session = JSON_INT_VALUE_VOICE_VREX_RESPONSE_TIMEOUT;
this->prefs.timeout_stats = JSON_INT_VALUE_VOICE_TIMEOUT_STATS;
this->prefs.timeout_packet_initial = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_INITIAL;
this->prefs.timeout_packet_subsequent = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_SUBSEQUENT;
this->prefs.bitrate_minimum = JSON_INT_VALUE_VOICE_BITRATE_MINIMUM;
this->prefs.time_threshold = JSON_INT_VALUE_VOICE_TIME_THRESHOLD;
this->prefs.utterance_save = ctrlm_is_production_build() ? JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_0 : JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_1;
this->prefs.utterance_use_curtail = JSON_BOOL_VALUE_VOICE_UTTERANCE_USE_CURTAIL;
this->prefs.utterance_file_qty_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_QTY_MAX;
this->prefs.utterance_file_size_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_SIZE_MAX;
this->prefs.utterance_path = JSON_STR_VALUE_VOICE_UTTERANCE_PATH;
this->prefs.utterance_duration_min = JSON_INT_VALUE_VOICE_MINIMUM_DURATION;
this->prefs.ffv_leading_samples = JSON_INT_VALUE_VOICE_FFV_LEADING_SAMPLES;
this->prefs.force_voice_settings = JSON_BOOL_VALUE_VOICE_FORCE_VOICE_SETTINGS;
this->prefs.vrex_test_flag = JSON_BOOL_VALUE_VOICE_VREX_TEST_FLAG;
this->prefs.vrex_wuw_bypass_success_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_SUCCESS_FLAG;
this->prefs.vrex_wuw_bypass_failure_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_FAILURE_FLAG;
this->prefs.force_toggle_fallback = JSON_BOOL_VALUE_VOICE_FORCE_TOGGLE_FALLBACK;
this->prefs.telemetry_session_stats = JSON_BOOL_VALUE_VOICE_TELEMETRY_SESSION_STATS;
this->prefs.par_voice_enabled = false;
this->prefs.par_voice_eos_method = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_METHOD;
this->prefs.par_voice_eos_timeout = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_TIMEOUT;
this->prefs.aspect_ratio = JSON_STR_VALUE_VOICE_ASPECT_RATIO;
this->prefs.guide_language = JSON_STR_VALUE_VOICE_LANGUAGE;
this->prefs.app_id_http = JSON_STR_VALUE_VOICE_APP_ID_HTTP;
this->prefs.app_id_ws = JSON_STR_VALUE_VOICE_APP_ID_WS;
this->prefs.timeout_vrex_connect = JSON_INT_VALUE_VOICE_VREX_REQUEST_TIMEOUT;
this->prefs.timeout_vrex_session = JSON_INT_VALUE_VOICE_VREX_RESPONSE_TIMEOUT;
this->prefs.timeout_stats = JSON_INT_VALUE_VOICE_TIMEOUT_STATS;
this->prefs.timeout_packet_initial = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_INITIAL;
this->prefs.timeout_packet_subsequent = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_SUBSEQUENT;
this->prefs.bitrate_minimum = JSON_INT_VALUE_VOICE_BITRATE_MINIMUM;
this->prefs.time_threshold = JSON_INT_VALUE_VOICE_TIME_THRESHOLD;
this->prefs.utterance_save = ctrlm_is_production_build() ? JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_0 : JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_1;
this->prefs.utterance_use_curtail = JSON_BOOL_VALUE_VOICE_UTTERANCE_USE_CURTAIL;
this->prefs.utterance_file_qty_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_QTY_MAX;
this->prefs.utterance_file_size_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_SIZE_MAX;
this->prefs.utterance_path = JSON_STR_VALUE_VOICE_UTTERANCE_PATH;
this->prefs.utterance_duration_min = JSON_INT_VALUE_VOICE_MINIMUM_DURATION;
this->prefs.ffv_leading_samples = JSON_INT_VALUE_VOICE_FFV_LEADING_SAMPLES;
this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(JSON_STR_VALUE_VOICE_VOICE_ACTIVITY_DETECTION_MODE);
this->prefs.force_voice_settings = JSON_BOOL_VALUE_VOICE_FORCE_VOICE_SETTINGS;
this->prefs.vrex_test_flag = JSON_BOOL_VALUE_VOICE_VREX_TEST_FLAG;
this->prefs.vrex_wuw_bypass_success_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_SUCCESS_FLAG;
this->prefs.vrex_wuw_bypass_failure_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_FAILURE_FLAG;
this->prefs.force_toggle_fallback = JSON_BOOL_VALUE_VOICE_FORCE_TOGGLE_FALLBACK;
this->prefs.telemetry_session_stats = JSON_BOOL_VALUE_VOICE_TELEMETRY_SESSION_STATS;
this->prefs.par_voice_enabled = false;
this->prefs.par_voice_eos_method = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_METHOD;
this->prefs.par_voice_eos_timeout = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_TIMEOUT;
this->voice_params_opus_encoder_default();
this->xrsr_opened = false;
this->voice_ipc = NULL;
this->packet_loss_threshold = JSON_INT_VALUE_VOICE_PACKET_LOSS_THRESHOLD;
this->vsdk_config = NULL;
this->nsm_voice_session = false;
this->xrsr_opened = false;
this->voice_ipc = NULL;
this->packet_loss_threshold = JSON_INT_VALUE_VOICE_PACKET_LOSS_THRESHOLD;
this->vsdk_config = NULL;
this->nsm_voice_session = false;

#ifndef TELEMETRY_SUPPORT
XLOGD_WARN("telemetry is not enabled");
Expand Down Expand Up @@ -261,6 +262,7 @@ ctrlm_voice_t::ctrlm_voice_t() {
this->secure_url_required = JSON_BOOL_VALUE_VOICE_REQUIRE_SECURE_URL;

XLOGD_TELEMETRY("require i_SAT <%s> i_MTLS <%s> i_secure_url <%s>", this->sat_token_required ? "YES" : "NO", this->mtls_required ? "YES" : "NO", this->secure_url_required ? "YES" : "NO");
XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));

errno_t safec_rc = memset_s(this->sat_token, sizeof(this->sat_token), 0, sizeof(this->sat_token));
ERR_CHK(safec_rc);
Expand Down Expand Up @@ -425,6 +427,13 @@ bool ctrlm_voice_t::voice_configure_config_file_json(json_t *obj_voice, json_t *
conf.config_value_get(JSON_STR_NAME_VOICE_URL_SRC_MIC_TAP, this->prefs.server_url_src_mic_tap);
conf.config_value_get(JSON_STR_NAME_VOICE_LANGUAGE, this->prefs.guide_language);
conf.config_value_get(JSON_INT_NAME_VOICE_MINIMUM_DURATION, this->prefs.utterance_duration_min);

std::string voice_activity_detection_mode;
if(conf.config_value_get(JSON_STR_NAME_VOICE_VOICE_ACTIVITY_DETECTION_MODE, voice_activity_detection_mode)) {
this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(voice_activity_detection_mode);
XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));
}

if(conf.config_value_get(JSON_BOOL_NAME_VOICE_ENABLE_SAT, this->sat_token_required)) {
ctrlm_sm_voice_sat_enable_write(this->sat_token_required);
XLOGD_TELEMETRY("require c_SAT <%s>", this->sat_token_required ? "YES" : "NO");
Expand Down Expand Up @@ -1089,6 +1098,17 @@ void ctrlm_voice_t::voice_params_opus_encoder_default(void) {
this->voice_params_opus_samples_per_packet_set();
}

/// @brief Translate a voice activity detection mode name into its xrsr enum value.
/// @param mode Mode name; "enabled" and "enforced" are the only recognized values (exact match).
/// @return The matching xrsr mode, or XRSR_STREAM_VOICE_ACTIVITY_MODE_DISABLED for any other string.
xrsr_stream_voice_activity_mode_t ctrlm_voice_t::voice_activity_detection_mode_to_xrsr(const std::string &mode) {
   // Start from the fallback: detection disabled (also used for an invalid mode)
   xrsr_stream_voice_activity_mode_t xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_DISABLED;

   if(mode == "enabled") {
      // Voice activity detection will be used but not enforced
      xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_ENABLED;
   } else if(mode == "enforced") {
      // Voice session will only proceed if voice activity is detected
      xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_ENFORCED;
   }

   return(xrsr_mode);
}

void ctrlm_voice_t::voice_params_opus_samples_per_packet_set(void) {
guchar fr_dur = (this->prefs.opus_encoder_params[3] >> 4) & 0xF;
switch(fr_dur) {
Expand Down Expand Up @@ -2645,6 +2665,12 @@ void ctrlm_voice_t::voice_session_end_callback(ctrlm_voice_session_end_cb_t *ses
end.result = SESSION_END_SHORT_UTTERANCE;
end.reason = (int)session->end_reason_rcu;
this->voice_ipc->session_end(end);
} else if(stats->session_end_reason == XRSR_SESSION_END_REASON_ERROR_AUDIO_SILENT) {
ctrlm_voice_ipc_event_session_end_t end;
end.common = session->ipc_common_data;
end.result = SESSION_END_SILENT_UTTERANCE;
end.reason = (int)session->end_reason_rcu;
Comment thread
dwolaver marked this conversation as resolved.
this->voice_ipc->session_end(end);
} else {
ctrlm_voice_ipc_event_session_end_server_stats_t server_stats;
ctrlm_voice_ipc_event_session_end_t end;
Expand Down Expand Up @@ -2931,15 +2957,24 @@ void ctrlm_voice_t::voice_stream_end_callback(ctrlm_voice_stream_end_cb_t *strea
#ifdef TELEMETRY_SUPPORT
if(this->prefs.telemetry_session_stats) {
uint32_t packets_total = session->packets_lost + session->packets_processed;
session->telemetry_session_stats.update_on_stream_end(stream_duration, packets_total, session->packets_lost, packets_total * samples_per_packet, session->packets_lost * samples_per_packet, decoder_failures, 0);
int32_t voice_detected = -1;
uint32_t peak_confidence = 0;
int32_t peak_rms_level = 0;
if(stats->audio_stats.vad_frames_processed > 0) {
voice_detected = (stats->audio_stats.vad_voice_detected) ? 1 : 0;
peak_confidence = (stats->audio_stats.vad_confidence_peak * 100);
peak_rms_level = stats->audio_stats.vad_rms_level_peak;
}

session->telemetry_session_stats.update_on_stream_end(stream_duration, packets_total, session->packets_lost, packets_total * samples_per_packet, session->packets_lost * samples_per_packet, decoder_failures, 0, voice_detected, peak_confidence, peak_rms_level);
}
#endif
} else if(samples_processed > 0) {
uint32_t stream_duration = (samples_processed / 16); // 16 kHz samples to ms
XLOGD_INFO("src <%s> Samples Lost/Total <%u/%u> %.02f%% buffered max <%u> duration <%u> ms", ctrlm_voice_device_str(session->voice_device), samples_lost, samples_lost + samples_processed, 100.0 * ((double)samples_lost / (double)(samples_lost + samples_processed)), samples_buffered_max, stream_duration);
#ifdef TELEMETRY_SUPPORT
if(this->prefs.telemetry_session_stats) {
session->telemetry_session_stats.update_on_stream_end(stream_duration, 0, 0, samples_lost + samples_processed, samples_lost, decoder_failures, samples_buffered_max);
session->telemetry_session_stats.update_on_stream_end(stream_duration, 0, 0, samples_lost + samples_processed, samples_lost, decoder_failures, samples_buffered_max, -1, 0, 0);
}
#endif
}
Expand Down Expand Up @@ -4052,7 +4087,6 @@ void ctrlm_voice_t::voice_rfc_retrieved_handler(const ctrlm_rfc_attr_t& attr) {
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_STANDBY_TIMEOUT_SESSION, this->prefs.dst_params_standby.timeout_session) |
attr.get_rfc_value(JSON_BOOL_NAME_VOICE_DST_PARAMS_STANDBY_IPV4_FALLBACK, this->prefs.dst_params_standby.ipv4_fallback) |
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_STANDBY_BACKOFF_DELAY, this->prefs.dst_params_standby.backoff_delay) |

attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_CONNECT_CHECK_INTERVAL, this->prefs.dst_params_low_latency.connect_check_interval) |
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_TIMEOUT_CONNECT, this->prefs.dst_params_low_latency.timeout_connect) |
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_TIMEOUT_INACTIVITY, this->prefs.dst_params_low_latency.timeout_inactivity) |
Expand All @@ -4062,6 +4096,13 @@ void ctrlm_voice_t::voice_rfc_retrieved_handler(const ctrlm_rfc_attr_t& attr) {
reroute = true;
}

std::string voice_activity_detection_mode;
if(attr.get_rfc_value(JSON_STR_NAME_VOICE_VOICE_ACTIVITY_DETECTION_MODE, voice_activity_detection_mode)) {
this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(voice_activity_detection_mode);
XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));
reroute = true;
}

std::vector<std::string> obj_server_hosts;
if(attr.get_rfc_value(JSON_ARRAY_NAME_VOICE_SERVER_HOSTS, obj_server_hosts)) {
this->url_hostname_patterns(obj_server_hosts);
Expand Down
Loading
Loading