/*
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <stdlib.h>
#include <string.h>
#include <glib.h>
#include <gio/gio.h>
#include <gst/gst.h>
#include <espeak/speak_lib.h>

/* Buffer length handed to espeak_Initialize() in init(); also the chunk size
 * espeak_reset() asks for while draining audio that is still pending. */
#define SYNC_BUFFER_SIZE 4096

/* Number of Espin slots in every context's ring (Econtext.queue). */
#define SPIN_QUEUE_SIZE 2
/* NOTE(review): not referenced anywhere in this file -- confirm it is still needed. */
#define SPIN_FRAME_SIZE 255

#include "espeak.h"

/*
 * Life-cycle of one Espin slot.  The values are distinct bits so that
 * espeak_out() can test for several states with a single mask (PLAY|OUT).
 */
typedef enum
{
    IN      = 1,    /* free: may be (re)filled by the worker thread */
    OUT     = 2,    /* filled by synth(), not yet touched by play() */
    PLAY    = 4     /* play() has started handing out its audio */
} SpinState;

/* Flags stored in Econtext.state. */
typedef enum
{
    /* Set by process_push() when the context is queued for the worker;
     * cleared by process() and process_pop(). */
    INPROCESS = 1
} ContextState;

/*
 * One slot of a context's ring buffer: the audio produced by a single
 * synth() call together with the espeak events reported for it.
 */
typedef struct
{
    Econtext *context;          /* owning context (back pointer) */

    volatile SpinState state;   /* accessed with g_atomic_int_*() */

    GMemoryOutputStream *sound; /* audio bytes written by synth_cb() */
    goffset sound_offset;       /* bytes of `sound` already returned by play() */

    GArray *events;             /* espeak_EVENT items; synth() appends a
                                   espeakEVENT_LIST_TERMINATED sentinel */
    goffset events_pos;         /* index of the next event play() examines */

    goffset last_word;          /* end (offset + length) of the last word
                                   reported via emit_word(); -1 after synth() */
    goffset mark_offset;        /* text position of the mark to report on the
                                   next play() in mark mode; -1 when none */
    const gchar *mark_name;     /* name of that pending mark, or NULL */
} Espin;

/*
 * Per-client synthesis context: the text being spoken, the ring of Espin
 * slots shared between the worker thread (producer) and espeak_out()
 * (consumer), and the voice parameters applied on every synth() call.
 */
struct _Econtext
{
    volatile ContextState state;    /* INPROCESS flag, see ContextState */

    gchar *text;                    /* private copy made by espeak_in() */
    goffset text_offset;            /* updated by synth() from the last event's
                                       text position */
    gsize text_len;                 /* strlen() of the text given to espeak_in() */

    Espin queue[SPIN_QUEUE_SIZE];   /* ring of slots */
    Espin *in;                      /* slot the worker thread fills next */
    Espin *out;                     /* slot espeak_out() reads from */

    GSList *process_chunk;          /* preallocated list node (data == this
                                       context) linked into process_queue */

    /* Settings written by the espeak_set_*() functions and read with atomic
     * getters in synth(). */
    volatile gint rate;
    volatile gint pitch;
    volatile const gchar *voice;    /* pointer is stored as-is, not copied */
    volatile gint gap;
    volatile gint track;

    GstElement *emitter;            /* element used as message source; a
                                       reference is held by the context */
    GstBus *bus;                    /* fetched lazily by emit_word()/emit_mark();
                                       NULL until the first message */
};

/*
 * Advance the ring cursor *i by one slot, wrapping back to `base` once it
 * steps past the last of the SPIN_QUEUE_SIZE slots.
 */
static inline void
spinning(Espin *base, Espin **i)
{
    Espin *next = *i + 1;

    *i = (next == base + SPIN_QUEUE_SIZE) ? base : next;
}

/*
 * Post an "espeak-word" element message on the emitter's bus.
 *
 * offset: text position of the word; len: its length.  Both are stored in
 * the message structure as unsigned integers.  The bus is looked up on
 * first use and cached in self->bus.
 */
static void
emit_word(Econtext *self, guint offset, guint len)
{
    if (self->bus == NULL)
        self->bus = gst_element_get_bus(self->emitter);

    GstStructure *payload = gst_structure_new("espeak-word",
            "offset", G_TYPE_UINT, offset,
            "len", G_TYPE_UINT, len,
            NULL);

    gst_bus_post(self->bus,
            gst_message_new_element(GST_OBJECT(self->emitter), payload));
}

/*
 * Post an "espeak-mark" element message on the emitter's bus.
 *
 * offset: text position associated with the mark; mark: its name.  The bus
 * is looked up on first use and cached in self->bus.
 */
static void
emit_mark(Econtext *self, guint offset, const gchar *mark)
{
    if (self->bus == NULL)
        self->bus = gst_element_get_bus(self->emitter);

    GstStructure *payload = gst_structure_new("espeak-mark",
            "offset", G_TYPE_UINT, offset,
            "mark", G_TYPE_STRING, mark,
            NULL);

    gst_bus_post(self->bus,
            gst_message_new_element(GST_OBJECT(self->emitter), payload));
}

/* One-time library setup; defined at the end of the file. */
static void init();
/* Queue a context for the worker thread / take it off the queue. */
static void process_push(Econtext*);
static void process_pop(Econtext*);

/* Worker thread running process(); all four objects are created in init(). */
static GThread *process_tid = NULL;
/* Guards process_queue and the contexts' INPROCESS flag. */
static GMutex *process_lock = NULL;
/* Broadcast when work is queued and after the worker finished a pass. */
static GCond  *process_cond = NULL;
/* Contexts waiting for synthesis, linked through Econtext.process_chunk. */
static GSList *process_queue = NULL;

/* Value returned by espeak_Initialize() in init(). */
static gint espeak_sample_rate = 0;
/* Array of (name, language, dialect) arrays built once in init(). */
static GValueArray *espeak_voices = NULL;

// -----------------------------------------------------------------------------

/*
 * Allocate a synthesis context bound to `emitter`.
 *
 * Makes sure the library is initialised, prepares every ring slot with an
 * empty growable memory stream and an empty event array, and installs the
 * default voice parameters.  A reference on `emitter` is taken and held
 * until espeak_unref().
 *
 * Returns the new context; release it with espeak_unref().
 */
Econtext*
espeak_new(GstElement *emitter)
{
    init();

    Econtext *self = g_new0(Econtext, 1);

    gint slot;
    for (slot = 0; slot < SPIN_QUEUE_SIZE; ++slot)
    {
        Espin *spin = self->queue + slot;

        spin->context = self;
        spin->state = IN;
        spin->sound = G_MEMORY_OUTPUT_STREAM(g_memory_output_stream_new(
                    NULL, 0, realloc, free));
        spin->events = g_array_new(FALSE, FALSE, sizeof(espeak_EVENT));
    }

    /* Producer and consumer both start at the first slot. */
    self->in = self->queue;
    self->out = self->queue;

    /* List node reused every time the context is put on process_queue. */
    self->process_chunk = g_slist_alloc();
    self->process_chunk->data = self;

    /* Defaults. */
    self->rate = 170;
    self->pitch = 50;
    self->gap = 0;
    self->voice = ESPEAK_DEFAULT_VOICE;
    self->track = ESPEAK_TRACK_NONE;

    self->emitter = gst_object_ref(emitter);
    self->bus = NULL;   /* resolved lazily when the first message is posted */

    GST_DEBUG("[%p]", self);

    return self;
}

/*
 * Destroy a context created by espeak_new().
 *
 * Pending work is cancelled and drained through espeak_reset(), then every
 * ring slot, the queue node and the GStreamer references are released.
 * `self` must not be used afterwards.
 */
void
espeak_unref(Econtext *self)
{
    GST_DEBUG("[%p]", self);

    espeak_reset(self);

    gint i;

    for (i = SPIN_QUEUE_SIZE; i--;)
    {
        g_output_stream_close(G_OUTPUT_STREAM(self->queue[i].sound),
                NULL, NULL);
        g_object_unref(self->queue[i].sound);
        g_array_free(self->queue[i].events, TRUE);
    }

    g_slist_free(self->process_chunk);

    /* The bus is only fetched when a word/mark message is first posted, so
     * it is still NULL for contexts that never emitted one. */
    if (self->bus)
        gst_object_unref(self->bus);
    gst_object_unref(self->emitter);

    /* Scrub the structure so stale pointers are not left behind. */
    memset(self, 0, sizeof(Econtext));
    g_free(self);
}

// in/out ----------------------------------------------------------------------

/*
 * Hand a new text to the context and queue it for synthesis.
 *
 * NULL or empty `text` is ignored.  Otherwise the text is copied, the
 * read position is rewound to 0 and the context is pushed to the worker.
 *
 * NOTE(review): a text stored by an earlier call is not released here;
 * presumably callers run espeak_reset() first -- confirm.
 */
void
espeak_in(Econtext *self, const gchar *text)
{
    GST_DEBUG("[%p] text=%s", self, text);

    if (text == NULL || text[0] == '\0')
        return;

    gchar *copy = g_strdup(text);

    self->text = copy;
    self->text_len = strlen(copy);
    self->text_offset = 0;

    process_push(self);
}

/*
 * No tracking: return as many bytes as are left in the slot, capped at
 * size_to_play.
 */
static inline gsize
play_whole(Espin *spin, gsize size_to_play)
{
    gsize spin_size = g_memory_output_stream_get_data_size(spin->sound);
    return MIN(size_to_play, spin_size - spin->sound_offset);
}

/*
 * Word tracking: find the next WORD event, emit an "espeak-word" message
 * for it if it lies past the last reported word, and size the chunk so it
 * ends at the sample position of the event following that word.  If that
 * point is further away than size_to_play, a full-size chunk is returned
 * and the event is looked at again on the next call.
 */
static inline gsize
play_word(Econtext *self, Espin *spin, gsize size_to_play)
{
    gsize spin_size = g_memory_output_stream_get_data_size(spin->sound);
    size_to_play = MIN(size_to_play, spin_size);

    goffset event;
    goffset sample_offset = 0;
    goffset text_offset = -1;
    gsize text_len = 0;

    /* The event list always ends with espeakEVENT_LIST_TERMINATED (appended
     * by synth()), which is what stops this scan. */
    for (event = spin->events_pos; TRUE; ++event)
    {
        espeak_EVENT *i = &g_array_index(spin->events, espeak_EVENT, event);

        GST_DEBUG("size_to_play=%" G_GSIZE_FORMAT " event=%" G_GINT64_FORMAT
                  " i->type=%d i->text_position=%d",
                  size_to_play, event, i->type, i->text_position);

        if (i->type == espeakEVENT_LIST_TERMINATED)
        {
            GST_DEBUG("i->sample=%d", i->sample*2);
            sample_offset = spin_size;
            break;
        }
        else if (i->type == espeakEVENT_WORD)
        {
            /* Samples are 16-bit, hence the factor 2 to get bytes. */
            sample_offset = i[1].sample*2;
            text_offset = i->text_position;
            text_len = i->length;

            GST_DEBUG("sample_offset=%" G_GINT64_FORMAT
                      " txt_offset=%" G_GINT64_FORMAT
                      " txt_len=%" G_GSIZE_FORMAT ", txt=%s",
                    sample_offset, text_offset, text_len,
                    self->text + text_offset);
            break;
        }
    }

    if (text_offset != -1 && text_offset > spin->last_word)
    {
        spin->last_word = text_offset + text_len;
        emit_word(self, text_offset, text_len);
    }

    if (sample_offset - spin->sound_offset > size_to_play)
    {
        GST_DEBUG("sample_offset=%" G_GINT64_FORMAT
                  " spin->sound_offset=%" G_GINT64_FORMAT,
                sample_offset, spin->sound_offset);
        return size_to_play;
    }

    /* Only step past the event when a word was actually consumed. */
    if (text_offset != -1)
        spin->events_pos = event + 1;

    return sample_offset - spin->sound_offset;
}

/*
 * Mark tracking: first report the mark remembered by the previous call,
 * then size the chunk so it ends at the next MARK event.  Marks located at
 * sample 0 are reported immediately (only while nothing of the slot has
 * been played yet) and skipped.
 */
static inline gsize
play_mark(Econtext *self, Espin *spin, gsize size_to_play)
{
    if (spin->mark_name)
    {
        emit_mark(self, spin->mark_offset, spin->mark_name);
        spin->mark_offset = -1;
        spin->mark_name = NULL;
    }

    gsize spin_size = g_memory_output_stream_get_data_size(spin->sound);
    size_to_play = MIN(size_to_play, spin_size);

    goffset event;
    goffset sample_offset = 0;
    guint mark_offset = 0;
    const gchar *mark_name = NULL;

    for (event = spin->events_pos; TRUE; ++event)
    {
        espeak_EVENT *i = &g_array_index(spin->events, espeak_EVENT, event);

        GST_DEBUG("size_to_play=%" G_GSIZE_FORMAT " event=%" G_GINT64_FORMAT
                  " i->type=%d i->text_position=%d",
                  size_to_play, event, i->type, i->text_position);

        if (i->type == espeakEVENT_LIST_TERMINATED)
        {
            sample_offset = spin_size;
            break;
        }
        else if (i->type == espeakEVENT_MARK)
        {
            if (i->sample == 0)
            {
                if (spin->sound_offset == 0)
                    emit_mark(self, i->text_position, i->id.name);
                continue;
            }

            mark_offset = i->text_position;
            mark_name = i->id.name;
            sample_offset = i->sample*2;
            break;
        }
    }

    if (sample_offset - spin->sound_offset > size_to_play)
    {
        GST_DEBUG("sample_offset=%" G_GINT64_FORMAT
                  " spin->sound_offset=%" G_GINT64_FORMAT,
                sample_offset, spin->sound_offset);
        return size_to_play;
    }

    /* Remember the mark; it is emitted at the start of the next call. */
    spin->mark_offset = mark_offset;
    spin->mark_name = mark_name;
    spin->events_pos = event + 1;

    return sample_offset - spin->sound_offset;
}

/*
 * Produce the next audio buffer from `spin`.
 *
 * The slot is switched to PLAY and the chunk size is chosen according to
 * the context's tracking mode (word, mark, or none).  The returned buffer
 * points directly into the slot's memory stream -- no copy is made here --
 * and spin->sound_offset is advanced by the chunk size.
 *
 * self:         owning context
 * spin:         slot to read from
 * size_to_play: upper bound for the chunk, in bytes
 *
 * Returns a new GstBuffer (possibly of size 0).
 */
GstBuffer*
play(Econtext *self, Espin *spin, gsize size_to_play)
{
    g_atomic_int_set(&spin->state, PLAY);

    switch (g_atomic_int_get(&self->track))
    {
        case ESPEAK_TRACK_WORD:
            size_to_play = play_word(self, spin, size_to_play);
            break;
        case ESPEAK_TRACK_MARK:
            size_to_play = play_mark(self, spin, size_to_play);
            break;
        default:
            size_to_play = play_whole(spin, size_to_play);
    }

    GstBuffer *out = gst_buffer_new();
    GST_BUFFER_DATA(out) =
            (guchar*)g_memory_output_stream_get_data(spin->sound) +
                spin->sound_offset;
    GST_BUFFER_SIZE(out) = size_to_play;

    spin->sound_offset += size_to_play;

    GST_DEBUG("out=%p size_to_play=%" G_GSIZE_FORMAT
              " tell=%" G_GINT64_FORMAT, GST_BUFFER_DATA(out),
            size_to_play, spin->sound_offset);

    return out;
}

/*
 * Fetch the next chunk of synthesized audio.
 *
 * Blocks on process_cond until the current output slot holds audio (state
 * OUT or PLAY).  If the slot is empty and the context is not queued for
 * synthesis (INPROCESS clear) there is nothing more to wait for and NULL
 * is returned.  A slot that has been played to its end is handed back to
 * the producer (state IN, context re-queued) and the next slot is tried.
 *
 * self:         context to read from
 * size_to_play: maximum number of bytes wanted
 *
 * Returns a GstBuffer produced by play(), or NULL when no audio is left.
 */
GstBuffer*
espeak_out(Econtext *self, gsize size_to_play)
{
    GST_DEBUG("[%p] size_to_play=%" G_GSIZE_FORMAT, self, size_to_play);

    for (;;)
    {
        g_mutex_lock(process_lock);
            while ((g_atomic_int_get(&self->out->state) & (PLAY|OUT)) == 0)
            {
                if ((self->state & INPROCESS) == 0)
                {
                    GST_DEBUG("[%p]", self);
                    g_mutex_unlock(process_lock);
                    return NULL;
                }
                GST_DEBUG("[%p]", self);
                g_cond_wait(process_cond, process_lock);
            }
        g_mutex_unlock(process_lock);

        Espin *spin = self->out;
        gsize spin_size = g_memory_output_stream_get_data_size(spin->sound);

        GST_DEBUG("[%p] spin->sound_offset=%" G_GINT64_FORMAT
                  " spin_size=%" G_GSIZE_FORMAT,
                self, spin->sound_offset, spin_size);

        if (g_atomic_int_get(&spin->state) == PLAY &&
                spin->sound_offset >= spin_size)
        {
            /* Slot exhausted: recycle it and move on to the next one. */
            g_atomic_int_set(&spin->state, IN);
            process_push(self);
            spinning(self->queue, &self->out);
            continue;
        }

        return play(self, spin, size_to_play);
    }
}

/*
 * Stop work on the current text: take the context off the worker queue,
 * drain and discard every buffer espeak_out() still returns, mark all ring
 * slots free again and release the stored text.
 */
void
espeak_reset(Econtext *self)
{
    process_pop(self);

    /* Discard whatever audio is still obtainable. */
    for (;;)
    {
        GstBuffer *pending = espeak_out(self, SYNC_BUFFER_SIZE);

        if (pending == NULL)
            break;
        gst_buffer_unref(pending);
    }

    int slot;
    for (slot = 0; slot < SPIN_QUEUE_SIZE; ++slot)
        g_atomic_int_set(&self->queue[slot].state, IN);

    /* g_free() accepts NULL, so no guard is needed. */
    g_free(self->text);
    self->text = NULL;
}

// espeak ----------------------------------------------------------------------

/*
 * Synthesis callback installed with espeak_SetSynthCallback().
 *
 * Appends the delivered samples (2 bytes each) to the slot passed as user
 * data and records every event up to the list terminator.  WORD events get
 * their text position decremented by one.  For MARK events the text is
 * scanned backwards from the event position: the first '"' found is
 * overwritten with NUL and the character after the second one becomes the
 * mark name, so the name points into the context's own text copy.
 *
 * Always returns 0.
 */
static gint
synth_cb(short *data, int numsamples, espeak_EVENT *events)
{
    if (data == NULL)
        return 0;

    Espin *spin = events->user_data;

    if (numsamples > 0)
    {
        g_output_stream_write(G_OUTPUT_STREAM(spin->sound), data, numsamples*2,
                NULL, NULL);

        espeak_EVENT *event = events;

        while (event->type != espeakEVENT_LIST_TERMINATED)
        {
            switch (event->type)
            {
                case espeakEVENT_WORD:
                    event->text_position -= 1;
                    break;

                case espeakEVENT_MARK:
                {
                    gchar *text = spin->context->text;
                    gchar *cursor = text + event->text_position;
                    gboolean closing_quote_seen = FALSE;

                    while (cursor > text)
                    {
                        if (*cursor == '"')
                        {
                            if (closing_quote_seen)
                            {
                                /* Opening quote: the name starts after it. */
                                event->id.name = cursor + 1;
                                break;
                            }
                            /* Closing quote: terminate the name here. */
                            *cursor = 0;
                            closing_quote_seen = TRUE;
                        }
                        --cursor;
                    }
                    break;
                }

                default:
                    break;
            }

            GST_DEBUG("type=%d text_position=%d length=%d "
                      "audio_position=%d sample=%d",
                    event->type, event->text_position, event->length,
                    event->audio_position, event->sample*2);

            g_array_append_val(spin->events, *event);
            ++event;
        }
    }

    GST_DEBUG("numsamples=%d", numsamples*2);

    return 0;
}

/*
 * Synthesize the context's text into `spin`.
 *
 * The slot is emptied, the current pitch/rate/voice/gap settings are
 * applied and espeak_Synth() is run; audio and events arrive through
 * synth_cb().  Afterwards self->text_offset is moved just past the text
 * position of the last recorded event and a espeakEVENT_LIST_TERMINATED
 * sentinel carrying the total sample count is appended to the event list.
 *
 * SSML parsing is enabled only in mark-tracking mode.
 */
static void
synth(Econtext *self, Espin *spin)
{
    /* Seeking alone keeps the stream's data size at its previous maximum,
     * so a shorter utterance would leave the old tail counted as valid
     * audio.  Truncate first so the size really starts from zero. */
    g_seekable_truncate(G_SEEKABLE(spin->sound), 0, NULL, NULL);
    g_seekable_seek(G_SEEKABLE(spin->sound), 0, G_SEEK_SET,
            NULL, NULL);
    g_array_set_size(spin->events, 0);
    spin->sound_offset = 0;
    spin->events_pos = 0;
    spin->mark_offset = -1;
    spin->mark_name = NULL;
    spin->last_word = -1;

    espeak_SetParameter(espeakPITCH, g_atomic_int_get(&self->pitch), 0);
    espeak_SetParameter(espeakRATE, g_atomic_int_get(&self->rate), 0);
    espeak_SetVoiceByName((gchar*)g_atomic_pointer_get(&self->voice));
    espeak_SetParameter(espeakWORDGAP, g_atomic_int_get(&self->gap), 0);

    gint track = g_atomic_int_get(&self->track);

    gint flags = espeakCHARS_UTF8;
    if (track == ESPEAK_TRACK_MARK)
        flags |= espeakSSML;

    GST_DEBUG("[%p] text_offset=%" G_GINT64_FORMAT, self, self->text_offset);

    /* text_len + 1 so the terminating NUL is part of the buffer. */
    espeak_Synth(self->text, self->text_len + 1, 0, POS_CHARACTER, 0, flags,
            NULL, spin);

    if (spin->events->len)
    {
        self->text_offset = g_array_index(spin->events,
                espeak_EVENT, spin->events->len-1).text_position + 1;
    }

    /* Sentinel relied upon by play()'s event scans. */
    espeak_EVENT last_event = { espeakEVENT_LIST_TERMINATED };
    last_event.sample = g_memory_output_stream_get_data_size(spin->sound) / 2;
    g_array_append_val(spin->events, last_event);
}

/*
 * Return the sample rate reported by espeak_Initialize().
 *
 * The library is initialised on demand (as espeak_get_voices() does) so
 * that a call made before any context exists does not return the 0
 * placeholder.
 */
gint
espeak_get_sample_rate()
{
    init();
    return espeak_sample_rate;
}

/*
 * Return a copy of the voice table built by init(): one entry per voice,
 * each itself an array of (name, language, dialect) strings.
 * The copy belongs to the caller.
 */
GValueArray*
espeak_get_voices()
{
    init();

    GValueArray *voices_copy = g_value_array_copy(espeak_voices);

    return voices_copy;
}

/*
 * Set the pitch used by subsequent synth() calls.
 *
 * 0 selects the default of 50; any other value is mapped to
 * (value + 100) / 2 and capped at 99.
 */
void
espeak_set_pitch(Econtext *self, gint value)
{
    gint pitch = 50;

    if (value != 0)
        pitch = MIN(99, (value + 100) / 2);

    g_atomic_int_set(&self->pitch, pitch);
}

/*
 * Set the speaking rate used by subsequent synth() calls.
 *
 * 0 selects the default of 170.  Negative values slow down by one unit
 * per step but never below 80; positive values speed up by two units per
 * step.
 */
void
espeak_set_rate(Econtext *self, gint value)
{
    gint rate;

    if (value < 0)
        rate = MAX(80, value + 170);
    else if (value > 0)
        rate = 170 + value * 2;
    else
        rate = 170;

    g_atomic_int_set(&self->rate, rate);
}

/*
 * Select the voice name passed to espeak_SetVoiceByName() by subsequent
 * synth() calls.  Only the pointer is stored (atomically); the string is
 * not copied, so it has to stay valid for as long as it may be used.
 */
void
espeak_set_voice(Econtext *self, const gchar *value)
{
    g_atomic_pointer_set(&self->voice, value);
}

/*
 * Set the word gap (espeakWORDGAP) applied by subsequent synth() calls.
 * The value is stored atomically.
 */
void
espeak_set_gap(Econtext *self, guint value)
{
    volatile gint *gap = &self->gap;

    g_atomic_int_set(gap, value);
}

/*
 * Choose the tracking mode (one of the ESPEAK_TRACK_* values) consulted by
 * play() and synth().  The value is stored atomically.
 */
void
espeak_set_track(Econtext *self, guint value)
{
    volatile gint *track = &self->track;

    g_atomic_int_set(track, value);
}

// process ----------------------------------------------------------------------

/*
 * Body of the worker thread started by init().
 *
 * Sleeps on process_cond until process_queue is non-empty, then serves the
 * queued contexts one by one: each gets its current input slot filled by
 * synth() and marked OUT.  A context whose next slot is free is put back
 * at the end of the queue; otherwise (next slot still busy, or the whole
 * text has been consumed) its INPROCESS flag is cleared.  Waiters are woken
 * once the queue has been emptied.
 *
 * process_lock is held for the whole time except while waiting on the
 * condition, i.e. also while synth() runs.  The loop never terminates, so
 * the statements after it are not reached.
 */
static gpointer
process(gpointer data)
{
    g_mutex_lock(process_lock);

    for (;;)
    {
        while (process_queue == NULL)
            g_cond_wait(process_cond, process_lock);

        while (process_queue)
        {
            Econtext *context = (Econtext*)process_queue->data;
            Espin *spin = context->in;

            /* Unlink the head node; it stays owned by the context. */
            process_queue = g_slist_remove_link(process_queue, process_queue);

            if (context->text_offset >= context->text_len)
            {
                /* Nothing left to synthesize for this text. */
                GST_DEBUG("[%p]", context);
                context->state &= ~INPROCESS;
            }
            else
            {
                synth(context, spin);
                g_atomic_int_set(&spin->state, OUT);
                spinning(context->queue, &context->in);

                if (g_atomic_int_get(&context->in->state) == IN)
                {
                    /* Next slot is free: keep the context queued. */
                    GST_DEBUG("[%p]", context);
                    process_queue = g_slist_concat(process_queue,
                            context->process_chunk);
                }
                else
                {
                    /* Ring is full; espeak_out() re-queues the context
                     * once it has recycled a slot. */
                    GST_DEBUG("[%p]", context);
                    context->state &= ~INPROCESS;
                }
            }
        }

        g_cond_broadcast(process_cond);
    }

    g_mutex_unlock(process_lock);

    return NULL;
}

/*
 * Queue `context` for the worker thread unless it is already flagged
 * INPROCESS.  Sets the flag, appends the context's list node to
 * process_queue and wakes everybody waiting on process_cond.
 */
static void
process_push(Econtext *context)
{
    GST_DEBUG("[%p]", context);
    g_mutex_lock(process_lock);

    gboolean already_queued = (context->state & INPROCESS) != 0;

    if (!already_queued)
    {
        process_queue = g_slist_concat(process_queue, context->process_chunk);
        context->state |= INPROCESS;
        g_cond_broadcast(process_cond);
    }

    g_mutex_unlock(process_lock);
    GST_DEBUG("[%p]", context);
}

/*
 * Withdraw `context` from the worker queue: its INPROCESS flag is cleared,
 * its list node is unlinked from process_queue and waiters on process_cond
 * are woken so they can re-evaluate their condition.
 */
static void
process_pop(Econtext *context)
{
    GST_DEBUG("[%p]", context);
    g_mutex_lock(process_lock);

    context->state &= ~INPROCESS;
    process_queue = g_slist_remove_link(process_queue, context->process_chunk);

    g_cond_broadcast(process_cond);
    g_mutex_unlock(process_lock);

    GST_DEBUG("[%p]", context);
}

// -----------------------------------------------------------------------------

static void
init()
{
    static volatile gsize initialized = 0;

    if (initialized == 0)
    {
        ++initialized;

        process_lock = g_mutex_new();
        process_cond = g_cond_new();
        process_tid = g_thread_create(process, NULL, FALSE, NULL);

        espeak_sample_rate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS,
                SYNC_BUFFER_SIZE, NULL, 0);
        espeak_SetSynthCallback(synth_cb);

        gsize count = 0;
        const espeak_VOICE **i;
        const espeak_VOICE **voices = espeak_ListVoices(NULL);

        for (i = voices; *i; ++i)
            ++count;
        espeak_voices = g_value_array_new(count);

        for (i = voices; *i; ++i)
        {
            GValueArray *voice = g_value_array_new(2);

            GValue name = { 0 };
            g_value_init(&name, G_TYPE_STRING);
            g_value_set_static_string(&name, (*i)->name);
            g_value_array_append(voice, &name);

            char *dialect_str = strchr((*i)->languages + 1, '-');
            if (dialect_str) *dialect_str++ = 0;

            GValue lang = { 0 };
            g_value_init(&lang, G_TYPE_STRING);
            g_value_set_static_string(&lang, (*i)->languages + 1);
            g_value_array_append(voice, &lang);

            GValue dialect = { 0 };
            g_value_init(&dialect, G_TYPE_STRING);
            g_value_set_static_string(&dialect, dialect_str ? dialect_str : "none");
            g_value_array_append(voice, &dialect);

            GValue voice_value = { 0 };
            g_value_init(&voice_value, G_TYPE_VALUE_ARRAY);
            g_value_set_boxed_take_ownership(&voice_value, voice);
            g_value_array_append(espeak_voices, &voice_value);
            g_value_unset(&voice_value);
        }
    }
}