/* * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ #include #include #include #include #include #define SYNC_BUFFER_SIZE 4096 #define SPIN_QUEUE_SIZE 2 #define SPIN_FRAME_SIZE 255 #include "espeak.h" typedef enum { IN = 1, OUT = 2, PLAY = 4 } SpinState; typedef enum { INPROCESS = 1, CLOSE = 2 } ContextState; typedef struct { Econtext *context; volatile SpinState state; GByteArray *sound; gsize sound_offset; GstClockTime audio_position; GArray *events; gsize events_pos; int last_word; int mark_offset; const gchar *mark_name; } Espin; struct _Econtext { volatile ContextState state; gchar *text; gsize text_offset; gsize text_len; gchar *next_mark; Espin queue[SPIN_QUEUE_SIZE]; Espin *in; Espin *out; GSList *process_chunk; volatile gint rate; volatile gint pitch; volatile const gchar *voice; volatile gint gap; volatile gint track; GstElement *emitter; GstBus *bus; }; static inline void spinning (Espin * base, Espin ** i) { if (++(*i) == base + SPIN_QUEUE_SIZE) *i = base; } static void post_message (Econtext * self, GstStructure * data) { if (!self->bus) self->bus = gst_element_get_bus (self->emitter); GstMessage *msg = gst_message_new_element (GST_OBJECT (self->emitter), data); gst_bus_post (self->bus, msg); } static void emit_word (Econtext * self, guint offset, guint len, guint id) { post_message (self, gst_structure_new ("espeak-word", "offset", G_TYPE_UINT, offset, "len", G_TYPE_UINT, len, "id", G_TYPE_UINT, id, NULL)); } static void emit_sentence (Econtext * self, guint offset, guint len, guint id) { post_message (self, gst_structure_new ("espeak-sentence", "offset", G_TYPE_UINT, offset, "len", G_TYPE_UINT, len, "id", G_TYPE_UINT, id, NULL)); } static void emit_mark (Econtext * self, guint offset, const gchar * mark) { post_message (self, gst_structure_new ("espeak-mark", "offset", G_TYPE_UINT, offset, "mark", G_TYPE_STRING, mark, NULL)); } static void init (); static void process_push (Econtext *, gboolean); static void process_pop (Econtext *); static GThread *process_tid = NULL; static GMutex *process_lock = NULL; static GCond *process_cond = NULL; static GSList *process_queue = NULL; static gint espeak_sample_rate = 0; static GValueArray *espeak_voices = NULL; // ----------------------------------------------------------------------------- Econtext *espeak_new (GstElement * emitter) { init (); Econtext *self = g_new0 (Econtext, 1); gint i; for (i = SPIN_QUEUE_SIZE; i--;) { Espin *spin = &self->queue[i]; spin->context = self; spin->state = IN; spin->sound = g_byte_array_new (); spin->events = g_array_new (FALSE, FALSE, sizeof (espeak_EVENT)); } self->in = self->queue; self->out = self->queue; self->process_chunk = g_slist_alloc (); self->process_chunk->data = self; self->pitch = 50; self->rate = 170; self->voice = ESPEAK_DEFAULT_VOICE; self->gap = 0; self->track = ESPEAK_TRACK_NONE; self->emitter = emitter; gst_object_ref (self->emitter); self->bus = NULL; GST_DEBUG ("[%p]", self); return self; } void espeak_unref (Econtext * self) { GST_DEBUG ("[%p]", self); espeak_reset (self); gint i; for (i = SPIN_QUEUE_SIZE; i--;) { g_byte_array_free (self->queue[i].sound, TRUE); g_array_free (self->queue[i].events, TRUE); } g_slist_free (self->process_chunk); gst_object_unref (self->bus); gst_object_unref (self->emitter); memset (self, 0, sizeof (Econtext)); g_free (self); } // in/out ---------------------------------------------------------------------- void espeak_in (Econtext * self, const gchar * text) { GST_DEBUG ("[%p] text=%s", self, text); if (text == NULL || *text == 0) return; self->text = g_strdup (text); self->text_offset = 0; self->text_len = strlen (text); process_push (self, TRUE); } GstBuffer *play (Econtext * self, Espin * spin, gsize size_to_play) { inline gsize whole (Espin * spin, gsize size_to_play) { for (;; ++spin->events_pos) { espeak_EVENT *i = &g_array_index (spin->events, espeak_EVENT, spin->events_pos); gsize len = i->sample * 2 - spin->sound_offset; if (i->type == espeakEVENT_LIST_TERMINATED || len >= size_to_play) return len; } } inline gsize events (Econtext * self, Espin * spin, gsize size_to_play) { gsize spin_size = spin->sound->len; gsize event; gsize sample_offset = 0; espeak_EVENT *i = &g_array_index (spin->events, espeak_EVENT, spin->events_pos); GST_DEBUG ("event=%zd i->type=%d i->text_position=%d", event, i->type, i->text_position); if (i->type == espeakEVENT_LIST_TERMINATED) { sample_offset = spin_size; } else { switch (i->type) { case espeakEVENT_MARK: emit_mark (self, i->text_position, i->id.name); break; case espeakEVENT_WORD: emit_word (self, i->text_position, i->length, i->id.number); break; case espeakEVENT_SENTENCE: emit_sentence (self, i->text_position, i->length, i->id.number); break; } } if (!sample_offset) { sample_offset = i->sample * 2; } return sample_offset - spin->sound_offset; } g_atomic_int_set (&spin->state, PLAY); switch (g_atomic_int_get (&self->track)) { case ESPEAK_TRACK_WORD: case ESPEAK_TRACK_MARK: size_to_play = events (self, spin, size_to_play); break; default: size_to_play = whole (spin, size_to_play); break; } espeak_EVENT *event = &g_array_index (spin->events, espeak_EVENT, spin->events_pos); GstBuffer *out = gst_buffer_new_wrapped_full (GST_MEMORY_FLAG_READONLY | GST_MEMORY_FLAG_NO_SHARE, spin->sound->data, spin->sound->len, spin->sound_offset, size_to_play, NULL, NULL); GST_BUFFER_OFFSET (out) = spin->sound_offset; GST_BUFFER_TIMESTAMP (out) = spin->audio_position; spin->audio_position = gst_util_uint64_scale_int (event->audio_position, GST_SECOND, 1000); GST_BUFFER_DURATION (out) = spin->audio_position - GST_BUFFER_TIMESTAMP (out); spin->sound_offset += size_to_play; spin->events_pos += 1; GST_DEBUG ("size_to_play=%zd tell=%zd ts=%" G_GUINT64_FORMAT " dur=%" G_GUINT64_FORMAT, size_to_play, spin->sound_offset, GST_BUFFER_TIMESTAMP (out), GST_BUFFER_DURATION (out)); return out; } GstBuffer *espeak_out (Econtext * self, gsize size_to_play) { GST_DEBUG ("[%p] size_to_play=%d", self, size_to_play); for (;;) { g_mutex_lock (process_lock); for (;;) { if (g_atomic_int_get (&self->out->state) & (PLAY | OUT)) break; if (self->state != INPROCESS) { if (self->state == CLOSE) GST_DEBUG ("[%p] sesseion is closed", self); else GST_DEBUG ("[%p] nothing to play", self); g_mutex_unlock (process_lock); return NULL; } GST_DEBUG ("[%p] wait for processed data", self); g_cond_wait (process_cond, process_lock); } g_mutex_unlock (process_lock); Espin *spin = self->out; gsize spin_size = spin->sound->len; GST_DEBUG ("[%p] spin=%p spin->sound_offset=%zd spin_size=%zd " "spin->state=%d", self, spin, spin->sound_offset, spin_size, g_atomic_int_get (&spin->state)); if (g_atomic_int_get (&spin->state) == PLAY && spin->sound_offset >= spin_size) { g_atomic_int_set (&spin->state, IN); process_push (self, FALSE); spinning (self->queue, &self->out); continue; } return play (self, spin, size_to_play); } GST_DEBUG ("[%p]", self); return NULL; } void espeak_reset (Econtext * self) { process_pop (self); GstBuffer *buf; while ((buf = espeak_out (self, SYNC_BUFFER_SIZE)) != NULL) gst_buffer_unref (buf); int i; for (i = SPIN_QUEUE_SIZE; i--;) g_atomic_int_set (&self->queue[i].state, IN); if (self->text) { g_free (self->text); self->text = NULL; } self->next_mark = NULL; } // espeak ---------------------------------------------------------------------- static gint synth_cb (short *data, int numsamples, espeak_EVENT * events) { if (data == NULL) return 0; Espin *spin = events->user_data; Econtext *self = spin->context; if (numsamples > 0) { g_byte_array_append (spin->sound, (const guint8 *) data, numsamples * 2); espeak_EVENT *i; for (i = events; i->type != espeakEVENT_LIST_TERMINATED; ++i) { GST_DEBUG ("type=%d text_position=%d length=%d " "audio_position=%d sample=%d", i->type, i->text_position, i->length, i->audio_position, i->sample * 2); // convert to 0-based position --i->text_position; if (i->type == espeakEVENT_MARK) { // point mark name to our text substring instead of // one which was temporally allocated by espeak if (self->next_mark == NULL) self->next_mark = self->text; int mark_len = strlen (i->id.name); strncpy (self->next_mark, i->id.name, mark_len); i->id.name = self->next_mark; self->next_mark[mark_len] = '\0'; self->next_mark += mark_len + 1; } GST_DEBUG ("text_position=%d length=%d", i->text_position, i->length); g_array_append_val (spin->events, *i); } } GST_DEBUG ("numsamples=%d", numsamples * 2); return 0; } static void synth (Econtext * self, Espin * spin) { g_byte_array_set_size (spin->sound, 0); g_array_set_size (spin->events, 0); spin->sound_offset = 0; spin->audio_position = 0; spin->events_pos = 0; spin->mark_offset = 0; spin->mark_name = NULL; spin->last_word = -1; espeak_SetParameter (espeakPITCH, g_atomic_int_get (&self->pitch), 0); espeak_SetParameter (espeakRATE, g_atomic_int_get (&self->rate), 0); espeak_SetVoiceByName ((gchar *) g_atomic_pointer_get (&self->voice)); espeak_SetParameter (espeakWORDGAP, g_atomic_int_get (&self->gap), 0); gint track = g_atomic_int_get (&self->track); gint flags = espeakCHARS_UTF8; if (track == ESPEAK_TRACK_MARK) flags |= espeakSSML; GST_DEBUG ("[%p] text_offset=%zd", self, self->text_offset); espeak_Synth (self->text, self->text_len + 1, 0, POS_CHARACTER, 0, flags, NULL, spin); if (spin->events->len) { int text_offset = g_array_index (spin->events, espeak_EVENT, spin->events->len - 1).text_position + 1; self->text_offset = g_utf8_offset_to_pointer (self->text, text_offset) - self->text; } espeak_EVENT last_event = { espeakEVENT_LIST_TERMINATED }; last_event.sample = spin->sound->len / 2; g_array_append_val (spin->events, last_event); } gint espeak_get_sample_rate () { return espeak_sample_rate; } GValueArray *espeak_get_voices () { init (); return g_value_array_copy (espeak_voices); } void espeak_set_pitch (Econtext * self, gint value) { if (value == 0) value = 50; else value = MIN (99, (value + 100) / 2); g_atomic_int_set (&self->pitch, value); } void espeak_set_rate (Econtext * self, gint value) { if (value == 0) value = 170; else if (value < 0) value = MAX (80, value + 170); else value = 170 + value * 2; g_atomic_int_set (&self->rate, value); } void espeak_set_voice (Econtext * self, const gchar * value) { g_atomic_pointer_set (&self->voice, value); } void espeak_set_gap (Econtext * self, guint value) { g_atomic_int_set (&self->gap, value); } void espeak_set_track (Econtext * self, guint value) { g_atomic_int_set (&self->track, value); } // process ---------------------------------------------------------------------- static gpointer process (gpointer data) { g_mutex_lock (process_lock); for (;;) { while (process_queue == NULL) g_cond_wait (process_cond, process_lock); while (process_queue) { Econtext *context = (Econtext *) process_queue->data; Espin *spin = context->in; process_queue = g_slist_remove_link (process_queue, process_queue); if (context->state == CLOSE) { GST_DEBUG ("[%p] session is closed", context); continue; } GST_DEBUG ("[%p] context->text_offset=%d context->text_len=%d", context, context->text_offset, context->text_len); if (context->text_offset >= context->text_len) { GST_DEBUG ("[%p] end of text to process", context); context->state &= ~INPROCESS; } else { synth (context, spin); g_atomic_int_set (&spin->state, OUT); spinning (context->queue, &context->in); if (g_atomic_int_get (&context->in->state) == IN) { GST_DEBUG ("[%p] continue to process data", context); process_queue = g_slist_concat (process_queue, context->process_chunk); } else { GST_DEBUG ("[%p] pause to process data", context); context->state &= ~INPROCESS; } } } g_cond_broadcast (process_cond); } g_mutex_unlock (process_lock); return NULL; } static void process_push (Econtext * context, gboolean force_in) { GST_DEBUG ("[%p] lock", context); g_mutex_lock (process_lock); if (context->state == CLOSE && !force_in) GST_DEBUG ("[%p] state=%d", context, context->state); else if (context->state != INPROCESS) { context->state = INPROCESS; process_queue = g_slist_concat (process_queue, context->process_chunk); g_cond_broadcast (process_cond); } g_mutex_unlock (process_lock); GST_DEBUG ("[%p] unlock", context); } static void process_pop (Econtext * context) { GST_DEBUG ("[%p] lock", context); g_mutex_lock (process_lock); process_queue = g_slist_remove_link (process_queue, context->process_chunk); context->state = CLOSE; g_cond_broadcast (process_cond); g_mutex_unlock (process_lock); GST_DEBUG ("[%p] unlock", context); } // ----------------------------------------------------------------------------- static void init () { static volatile gsize initialized = 0; if (initialized == 0) { ++initialized; process_lock = g_mutex_new (); process_cond = g_cond_new (); process_tid = g_thread_create (process, NULL, FALSE, NULL); espeak_sample_rate = espeak_Initialize (AUDIO_OUTPUT_SYNCHRONOUS, SYNC_BUFFER_SIZE, NULL, 0); espeak_SetSynthCallback (synth_cb); gsize count = 0; const espeak_VOICE **i; const espeak_VOICE **voices = espeak_ListVoices (NULL); for (i = voices; *i; ++i) ++count; espeak_voices = g_value_array_new (count); for (i = voices; *i; ++i) { GValueArray *voice = g_value_array_new (2); GValue name = { 0 }; g_value_init (&name, G_TYPE_STRING); g_value_set_static_string (&name, (*i)->name); g_value_array_append (voice, &name); char *dialect_str = strchr ((*i)->languages + 1, '-'); if (dialect_str) *dialect_str++ = 0; GValue lang = { 0 }; g_value_init (&lang, G_TYPE_STRING); g_value_set_static_string (&lang, (*i)->languages + 1); g_value_array_append (voice, &lang); GValue dialect = { 0 }; g_value_init (&dialect, G_TYPE_STRING); g_value_set_static_string (&dialect, dialect_str ? dialect_str : "none"); g_value_array_append (voice, &dialect); GValue voice_value = { 0 }; g_value_init (&voice_value, G_TYPE_VALUE_ARRAY); g_value_take_boxed (&voice_value, voice); g_value_array_append (espeak_voices, &voice_value); g_value_unset (&voice_value); } } }