/* C++ source  |  583 lines  |  22.64 KB */

/*
 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * @file picowa.c
 *
 * word analysis PU - lexicon lookup and POS prediction
 *
 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
 * All rights reserved.
 *
 * History:
 * - 2009-04-20 -- initial version
 *
 */

#include "picoos.h"
#include "picodbg.h"
#include "picodata.h"
#include "picowa.h"
#include "picoklex.h"
#include "picokdt.h"
#include "picoktab.h"

#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif

/* PU waStep states: the values stored in wa_subobj_t.procState and
 * dispatched on by waStep() */
/* fetch one item from the input char buffer into the internal inBuf */
#define WA_STEPSTATE_COLLECT  0
/* process the item held in inBuf, the result is placed in outBuf */
#define WA_STEPSTATE_PROCESS  1
/* put the item held in outBuf into the output char buffer */
#define WA_STEPSTATE_FEED     2


/*  subobject    : WordAnaUnit
 *  shortcut     : wa
 *  context size : one item
 *
 *  Private state of the word analysis PU. It is allocated in
 *  picowa_newWordAnaUnit(), set up by waInitialize() and reached through
 *  the subObj pointer of the processing unit.
 */
typedef struct wa_subobj {
    /* one of the WA_STEPSTATE_* values, selects the branch taken by
       waStep() */
    picoos_uint8 procState; /* for next processing step decision */

    /* one item only */
    picoos_uint8 inBuf[PICOWA_MAXITEMSIZE]; /* internal input buffer */
    /* set to PICOWA_MAXITEMSIZE by waInitialize() */
    picoos_uint16 inBufSize; /* actually allocated size */
    picoos_uint16 inLen; /* length of item in inBuf, 0 for empty buf */

    picoos_uint8 outBuf[PICOWA_MAXITEMSIZE]; /* internal output buffer */
    /* set to PICOWA_MAXITEMSIZE by waInitialize() */
    picoos_uint16 outBufSize; /* actually allocated size */
    picoos_uint16 outLen; /* length of item in outBuf, 0 for empty buf */

    /* lex knowledge base (main lexicon, mandatory, see waInitialize()) */
    picoklex_Lex lex;

    /* ulex knowledge bases: the first numUlex entries of ulex[] are valid;
       they are searched in index order before the main lexicon */
    picoos_uint8 numUlex;
    picoklex_Lex ulex[PICOKNOW_MAX_NUM_ULEX];

    /* tab knowledge base, used to map several POS to one POS group */
    picoktab_Pos tabpos;

    /* dtposp knowledge base, used for POS prediction of words that are
       not found in any lexicon */
    picokdt_DtPosP dtposp;
} wa_subobj_t;


/**
 * (Re)initializes the word analysis PU.
 *
 * The step state and both internal item buffers are always reset. For any
 * reset mode other than PICO_RESET_SOFT the knowledge base handles (main
 * lexicon, user lexica, POS table, POS prediction tree) are looked up in
 * the voice as well.
 *
 * @param this    the processing unit, must carry a wa_subobj_t in subObj
 * @param r_mode  PICO_RESET_SOFT to reset the buffers/state only, any
 *                other value for a full initialization
 * @return PICO_OK on success
 * @return PICO_ERR_NULLPTR_ACCESS if 'this' or its subObj is NULL
 * @return the result of raising PICO_EXC_KB_MISSING if the main lexicon,
 *         the POS table or the POS prediction tree is not available;
 *         missing user lexica are not an error
 */
static pico_status_t waInitialize(register picodata_ProcessingUnit this, picoos_int32 r_mode) {
    picoos_uint8 i;
    picoklex_Lex ulex;
    wa_subobj_t * wa;

    picoknow_kb_id_t ulexKbIds[PICOKNOW_MAX_NUM_ULEX] = PICOKNOW_KBID_ULEX_ARRAY;

    PICODBG_DEBUG(("calling"));

    if (NULL == this) {
        /* without a PU there is no exception manager to raise on, so
           only the status can be reported */
        return PICO_ERR_NULLPTR_ACCESS;
    }
    if (NULL == this->subObj) {
        return picoos_emRaiseException(this->common->em,
                                       PICO_ERR_NULLPTR_ACCESS, NULL, NULL);
    }
    wa = (wa_subobj_t *) this->subObj;
    wa->procState = WA_STEPSTATE_COLLECT;
    wa->inBufSize = PICOWA_MAXITEMSIZE;
    wa->inLen = 0;
    wa->outBufSize = PICOWA_MAXITEMSIZE;
    wa->outLen = 0;

    if (r_mode == PICO_RESET_SOFT) {
        /*following initializations needed only at startup or after a full reset*/
        return PICO_OK;
    }
    /* kb lex */
    wa->lex = picoklex_getLex(this->voice->kbArray[PICOKNOW_KBID_LEX_MAIN]);
    if (wa->lex == NULL) {
        return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
                                       NULL, NULL);
    }
    PICODBG_DEBUG(("got lex"));

    /* kb ulex[]: optional, only the lexica actually present are stored,
       packed at the start of wa->ulex[] */
    wa->numUlex = 0;
    for (i = 0; i<PICOKNOW_MAX_NUM_ULEX; i++) {
        ulex = picoklex_getLex(this->voice->kbArray[ulexKbIds[i]]);
        if (NULL != ulex) {
            wa->ulex[wa->numUlex++] = ulex;
        }
    }
    PICODBG_DEBUG(("got %i user lexica", wa->numUlex));

    /* kb tabpos */
    wa->tabpos =
        picoktab_getPos(this->voice->kbArray[PICOKNOW_KBID_TAB_POS]);
    if (wa->tabpos == NULL) {
        return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
                                       NULL, NULL);
    }
    PICODBG_DEBUG(("got tabpos"));

    /* kb dtposp */
    wa->dtposp = picokdt_getDtPosP(this->voice->kbArray[PICOKNOW_KBID_DT_POSP]);
    if (wa->dtposp == NULL) {
        return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
                                       NULL, NULL);
    }
    PICODBG_DEBUG(("got dtposp"));
    return PICO_OK;
}

/* forward declaration: waStep is defined at the end of this file, but
   picowa_newWordAnaUnit() needs its address to install it as the step
   function of the PU */
static picodata_step_result_t waStep(register picodata_ProcessingUnit this,
                                     picoos_int16 mode,
                                     picoos_uint16 *numBytesOutput);

/**
 * Terminate hook of the word analysis PU; there is nothing to do here.
 *
 * @param this  the processing unit (not used)
 * @return always PICO_OK
 */
static pico_status_t waTerminate(register picodata_ProcessingUnit this) {
    (void) this;    /* intentionally unused */
    return PICO_OK;
}

/**
 * Releases the wa_subobj_t attached to the processing unit.
 *
 * @param this  the processing unit; NULL is accepted and ignored
 * @param mm    not used, the memory manager found in this->common is
 *              used for the deallocation instead
 * @return always PICO_OK
 */
static pico_status_t waSubObjDeallocate(register picodata_ProcessingUnit this,
                                        picoos_MemoryManager mm) {
    (void) mm;      /* avoid warning "var not used in this function" */

    if (NULL == this) {
        return PICO_OK;
    }
    picoos_deallocate(this->common->mm, (void *) &this->subObj);
    return PICO_OK;
}


/**
 * Creates a word analysis processing unit.
 *
 * A generic processing unit is created, the wa specific callbacks
 * (initialize, step, terminate, subDeallocate) are installed, the
 * wa_subobj_t is allocated and a full initialization is performed.
 *
 * @param mm      memory manager used for all allocations done here
 * @param common  common object, its exception manager is used to report
 *                an out-of-memory condition
 * @param cbIn    input char buffer of the PU
 * @param cbOut   output char buffer of the PU
 * @param voice   voice providing the knowledge bases
 * @return the new processing unit, or NULL if the unit or its sub object
 *         could not be allocated or if the initialization failed (in
 *         the latter case waInitialize has already raised the exception)
 */
picodata_ProcessingUnit picowa_newWordAnaUnit(picoos_MemoryManager mm,
                                              picoos_Common common,
                                              picodata_CharBuffer cbIn,
                                              picodata_CharBuffer cbOut,
                                              picorsrc_Voice voice) {
    picodata_ProcessingUnit this;

    this = picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice);
    if (this == NULL) {
        return NULL;
    }

    this->initialize = waInitialize;
    PICODBG_DEBUG(("set this->step to waStep"));
    this->step = waStep;
    this->terminate = waTerminate;
    this->subDeallocate = waSubObjDeallocate;
    this->subObj = picoos_allocate(mm, sizeof(wa_subobj_t));
    if (this->subObj == NULL) {
        picoos_deallocate(mm, (void *)&this);
        picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL);
        return NULL;
    }

    if (PICO_OK != waInitialize(this, PICO_RESET_FULL)) {
        /* a mandatory knowledge base is missing: the remaining handles in
           the sub object were never set, so the unit must not be handed
           out; release what was allocated above */
        picoos_deallocate(mm, (void *) &this->subObj);
        picoos_deallocate(mm, (void *) &this);
        return NULL;
    }
    return this;
}

/* ***********************************************************************/
/*                       WORDGRAPH proc functions                        */
/* ***********************************************************************/

/**
 * Predicts the POS of a word graph with the dtposp decision tree.
 *
 * @param this      the processing unit, used for raising warnings
 * @param wa        the wa sub object holding the dtposp knowledge base
 * @param graph     the graphemes of the word
 * @param graphlen  number of bytes in graph
 * @return the predicted class, or PICODATA_ITEMINFO1_ERR (after raising
 *         a warning) if any of the decision tree steps fails or no
 *         result is set
 */
static picoos_uint8 waClassifyPos(register picodata_ProcessingUnit this,
                                  register wa_subobj_t *wa,
                                  const picoos_uint8 *graph,
                                  const picoos_uint16 graphlen) {
    picokdt_classify_result_t dtres;
    picoos_uint8 hasSpecChar = FALSE;
    picoos_uint16 pos;

    PICODBG_DEBUG(("graphlen %d", graphlen));

    /* Look for a special char in the graph. For now this is the
       hard-coded ascii hyphen only, ie. preproc needs to map all UTF8
       hyphens to the ascii hyphen. */
    /*  @todo : consider specifying special char(s) in lingware. */
    for (pos = 0; pos < graphlen; pos++) {
        if ('-' == graph[pos]) {
            hasSpecChar = TRUE;
            break;
        }
    }

    /* step 1: construct the input vector, it is kept inside dtposp */
    if (!picokdt_dtPosPconstructInVec(wa->dtposp, graph, graphlen,
                                      hasSpecChar)) {
        PICODBG_WARN(("problem with invec"));
        picoos_emRaiseWarning(this->common->em, PICO_WARN_INVECTOR, NULL, NULL);
        return PICODATA_ITEMINFO1_ERR;
    }

    /* step 2: classify */
    if (!picokdt_dtPosPclassify(wa->dtposp)) {
        PICODBG_WARN(("problem classifying"));
        picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION,
                              NULL, NULL);
        return PICODATA_ITEMINFO1_ERR;
    }

    /* step 3: decompose the output class */
    if (!picokdt_dtPosPdecomposeOutClass(wa->dtposp, &dtres)) {
        PICODBG_WARN(("problem decomposing"));
        picoos_emRaiseWarning(this->common->em, PICO_WARN_OUTVECTOR,
                              NULL, NULL);
        return PICODATA_ITEMINFO1_ERR;
    }

    if (!dtres.set) {
        PICODBG_WARN(("result not set"));
        picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION,
                              NULL, NULL);
        return PICODATA_ITEMINFO1_ERR;
    }

    PICODBG_DEBUG(("class %d", dtres.class));
    return (picoos_uint8)dtres.class;
}


static pico_status_t waProcessWordgraph(register picodata_ProcessingUnit this,
                                        register wa_subobj_t *wa /*inout*/,
                                        picodata_itemhead_t *head /*inout*/,
                                        const picoos_uint8 *content) {
    pico_status_t status;
    picoklex_lexl_result_t lexres;
    picoos_uint8 posbuf[PICOKTAB_MAXNRPOS_IN_COMB];
    picoos_uint8 i;
    picoos_uint8 foundIndex;
    picoos_bool found;


    PICODBG_DEBUG(("type %c, len %d", head->type, head->len));

    /* do lookup
       if no entry found:
         do POS prediction:     -> WORDGRAPH(POSes,NA)graph
       else:
         if incl-phone:
           N entries possible  -> WORDINDEX(POSes,NA)POS1|ind1...POSN|indN
           (N in {1,...,PICOKLEX_MAX_NRRES}, now up to 4)
         else:
           no phone, one entry  -> WORDGRAPH(POS,NA)graph
    */

    found = FALSE;
    i = 0;
    while (!found && (i < wa->numUlex)) {
        found = picoklex_lexLookup(wa->ulex[i], content, head->len, &lexres);
        i++;
    }
    /* note that if found, i will be incremented nevertheless, so i >= 1 */
    if (found) {
        foundIndex = i;
    } else {
        foundIndex = 0;
    }
    if (!found && !picoklex_lexLookup(wa->lex, content, head->len, &lexres)) {
        /* no lex entry found, WORDGRAPH(POS,NA)graph */
        if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen,
                                          wa->outBuf, wa->outBufSize,
                                          &wa->outLen)) {
            wa->inLen = 0;
            /* predict and modify pos in info1 */
            if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen,
                                   waClassifyPos(this, wa, content, head->len))) {
                return picoos_emRaiseException(this->common->em,
                                               PICO_EXC_BUF_OVERFLOW,NULL,NULL);
            }
        }

    } else {    /* at least one entry found */
        PICODBG_DEBUG(("at least one entry found in lexicon %i",foundIndex));
        if (lexres.phonfound) {    /* incl. ind-phone and possibly multi-ent. */
            if (lexres.nrres > PICOKLEX_MAX_NRRES) {
                /* not possible with system lexicon, needs to be
                   ensured for user lex too */
                picoos_emRaiseWarning(this->common->em, PICO_WARN_FALLBACK,NULL,
                        (picoos_char *)"using %d lexicon lookup results",
                        PICOKLEX_MAX_NRRES);
                lexres.nrres = PICOKLEX_MAX_NRRES;
            }
            head->type = PICODATA_ITEM_WORDINDEX;
            if (lexres.nrres == 1) {
                head->info1 = lexres.posind[0];
            } else {
                /* more than one result, POSgroup info needs to be
                   determined for later POS disambiguation */
                for (i = 0; i < lexres.nrres; i++) {
                    posbuf[i] = lexres.posind[i * PICOKLEX_POSIND_SIZE];
                }
                head->info1 = picoktab_getPosGroup(wa->tabpos, posbuf,
                                                   lexres.nrres);
            }
            head->info2 = foundIndex;
            head->len = lexres.posindlen;
            if ((status = picodata_put_itemparts(head, lexres.posind,
                                                 lexres.posindlen,
                                                 wa->outBuf, wa->outBufSize,
                                                 &wa->outLen)) == PICO_OK) {
                wa->inLen = 0;
            } else {
                return picoos_emRaiseException(this->common->em, status,
                                               NULL, NULL);
            }

        } else {    /* no phone, :G2P, one entry: WORDGRAPH(POS,NA)graph */
            if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen,
                                              wa->outBuf, wa->outBufSize,
                                              &wa->outLen)) {
                wa->inLen = 0;
                /* set lex pos in info1 */
                if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen,
                                                      lexres.posind[0])) {
                    return picoos_emRaiseException(this->common->em,
                                                   PICO_EXC_BUF_OVERFLOW,
                                                   NULL, NULL);
                }
            }
        }
    }
    return PICO_OK;
}


/* ***********************************************************************/
/*                          waStep function                              */
/* ***********************************************************************/

/*
   collect into internal buffer, process, and then feed to output buffer

   init state: COLLECT      ext      ext
   state transitions:       in IN OUTout
   COLLECT | getOneItem  ->-1 +1  0  0   | (ATOMIC) -> PROCESS (got item)
   COLLECT | getOneItem  -> 0  0  0  0   | IDLE                (got no item)

   PROCESS | procOneItem -> 0 -1 +1  0   | (ATOMIC) -> FEED    (proc'ed item)
   PROCESS | procOneItem -> 0 -1  0  0   | BUSY     -> COLLECT (item skipped)

   FEED    | putOneItem  -> 0  0 -1 +1   | BUSY     -> COLLECT (put item)
   FEED    | putOneItem  -> 0  0  1  0   | OUT_FULL            (put no item)

   The transitions marked (ATOMIC) do not return to the caller: the
   corresponding return statements are commented out below, so the next
   state is handled within the same call.
*/

/**
 * Step function of the word analysis PU.
 *
 * @param this            the processing unit
 * @param mode            not used
 * @param numBytesOutput  set to 0 on entry; passed to picodata_cbPutItem
 *                        when an item is fed to the output char buffer
 * @return PICODATA_PU_IDLE     no item available in the input char buffer
 * @return PICODATA_PU_BUSY     an item was put, skipped or discarded
 * @return PICODATA_PU_OUT_FULL the output char buffer cannot take the item
 * @return PICODATA_PU_ERROR    'this' or its subObj is NULL, an item could
 *                              not be fetched, processed or fed, or the
 *                              step state is unknown
 */
static picodata_step_result_t waStep(register picodata_ProcessingUnit this,
                                     picoos_int16 mode,
                                     picoos_uint16 * numBytesOutput) {
    register wa_subobj_t *wa;
    pico_status_t rv = PICO_OK;

    if (NULL == this || NULL == this->subObj) {
        return PICODATA_PU_ERROR;
    }
    wa = (wa_subobj_t *) this->subObj;
    (void) mode;        /* avoid warning "var not used in this function"*/
    *numBytesOutput = 0;
    while (1) { /* exit via return */
        PICODBG_DEBUG(("doing state %i, inLen: %d, outLen: %d",
                       wa->procState, wa->inLen, wa->outLen));

        switch (wa->procState) {
            /* collect state: get item from charBuf and store in
             * internal inBuf
             */
            case WA_STEPSTATE_COLLECT:
                if (wa->inLen == 0) { /* is input buffer empty? */
                    picoos_uint16 blen;
                    /* try to get one item */
                    rv = picodata_cbGetItem(this->cbIn, wa->inBuf,
                                            wa->inBufSize, &blen);
                    PICODBG_DEBUG(("after getting item, status: %d", rv));
                    if (PICO_OK == rv) {
                        /* we now have one item */
                        wa->inLen = blen;
                        wa->procState = WA_STEPSTATE_PROCESS;
                        /* uncomment next line to split into two steps */
                        /* return PICODATA_PU_ATOMIC; */
                    } else if (PICO_EOF == rv) {
                        /* there was no item in the char buffer */
                        return PICODATA_PU_IDLE;
                    } else if ((PICO_EXC_BUF_UNDERFLOW == rv)
                               || (PICO_EXC_BUF_OVERFLOW == rv)) {
                        PICODBG_ERROR(("problem getting item"));
                        picoos_emRaiseException(this->common->em, rv,
                                                NULL, NULL);
                        return PICODATA_PU_ERROR;
                    } else {
                        PICODBG_ERROR(("problem getting item, unhandled"));
                        picoos_emRaiseException(this->common->em, rv,
                                                NULL, NULL);
                        return PICODATA_PU_ERROR;
                    }
                } else { /* there already is an item in the input buffer */
                    PICODBG_WARN(("item already in input buffer"));
                    picoos_emRaiseWarning(this->common->em,
                                          PICO_WARN_PU_IRREG_ITEM, NULL, NULL);
                    wa->procState = WA_STEPSTATE_PROCESS;
                    /* uncomment next to split into two steps */
                    /* return PICODATA_PU_ATOMIC; */
                }
                break;


            /* process state: process item in internal inBuf and put
             * result in internal outBuf
             */
            case WA_STEPSTATE_PROCESS:

                /* ensure there is an item in inBuf and it is valid */
                if ((wa->inLen > 0) && picodata_is_valid_item(wa->inBuf,
                                                              wa->inLen)) {
                    picodata_itemhead_t ihead;
                    picoos_uint8 *icontent;
                    pico_status_t rvP = PICO_OK;

                    rv = picodata_get_iteminfo(wa->inBuf, wa->inLen, &ihead,
                                               &icontent);
                    if (PICO_OK == rv) {

                        switch (ihead.type) {
                            case PICODATA_ITEM_WORDGRAPH:

                                if (0 < ihead.len) {
                                    rvP = waProcessWordgraph(this, wa, &ihead,
                                                             icontent);
                                } else {
                                    /* else ignore empty WORDGRAPH */
                                    wa->inLen = 0;
                                    wa->procState = WA_STEPSTATE_COLLECT;
                                    return PICODATA_PU_BUSY;
                                }
                                break;
                            case PICODATA_ITEM_OTHER:
                                /* skip item */
                                rvP = PICO_WARN_PU_DISCARD_BUF;
                                break;
                            default:
                                /* copy item unmodified */
                                rvP = picodata_copy_item(wa->inBuf,
                                                         wa->inLen, wa->outBuf,
                                                         wa->outBufSize, &wa->outLen);
                                break;
                        }

                        if (PICO_OK == rvP) {
                            wa->inLen = 0;
                            wa->procState = WA_STEPSTATE_FEED;
                            /* uncomment next to split into two steps */
                            /* return PICODATA_PU_ATOMIC; */
                        } else if (PICO_WARN_PU_DISCARD_BUF == rvP) {
                            /* discard input buffer and get a new item */
                            PICODBG_INFO(("skipping OTHER item"));
/*                            picoos_emRaiseWarning(this->common->em,
                                                  PICO_WARN_PU_DISCARD_BUF, NULL, NULL);
*/
                            wa->inLen = 0;
                            wa->procState = WA_STEPSTATE_COLLECT;
                            return PICODATA_PU_BUSY;
                        } else {
                            /* PICO_EXC_BUF_OVERFLOW   <- overflow in outbuf
                               PICO_ERR_OTHER          <- no valid item in inbuf
                               or return from processWordgraph
                            */
                            PICODBG_ERROR(("problem processing item, status: %d",
                                           rvP));
                            picoos_emRaiseException(this->common->em, rvP,
                                                    NULL, NULL);
                            return PICODATA_PU_ERROR;
                        }

                    } else {    /* could not get iteminfo */
                        /* PICO_EXC_BUF_OVERFLOW   <- overflow in outbuf
                           PICO_ERR_OTHER          <- no valid item in inbuf
                        */
                        PICODBG_ERROR(("problem getting item info, "
                                       "discard buffer content"));
                        wa->inLen = 0;
                        wa->procState = WA_STEPSTATE_COLLECT;
                        picoos_emRaiseException(this->common->em, rv,
                                                NULL, NULL);
                        return PICODATA_PU_ERROR;
                    }

                } else if (wa->inLen == 0) {    /* no item in inBuf */
                    PICODBG_INFO(("no item in inBuf"));
                    /* wa->inLen = 0;*/
                    wa->procState = WA_STEPSTATE_COLLECT;
                    return PICODATA_PU_BUSY;

                } else {    /* no valid item in inBuf */
                    /* bad state/item, discard buffer content */
                    PICODBG_WARN(("no valid item, discard buffer content"));
                    picoos_emRaiseWarning(this->common->em,
                                          PICO_WARN_PU_IRREG_ITEM, NULL, NULL);
                    picoos_emRaiseWarning(this->common->em,
                                          PICO_WARN_PU_DISCARD_BUF, NULL, NULL);
                    wa->inLen = 0;
                    wa->procState = WA_STEPSTATE_COLLECT;
                    return PICODATA_PU_BUSY;
                }
                break;


            /* feed state: copy item in internal outBuf to output charBuf */
            case WA_STEPSTATE_FEED:

                /* check that item fits in cb should not be needed */
                rv = picodata_cbPutItem(this->cbOut, wa->outBuf,
                                        wa->outLen, numBytesOutput);

                PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG],
                                   (picoos_uint8 *)"wana: ", wa->outBuf,
                                   wa->outLen);

                PICODBG_DEBUG(("put item, status: %d", rv));
                if (PICO_OK == rv) {
                    wa->outLen = 0;
                    wa->procState = WA_STEPSTATE_COLLECT;
                    return PICODATA_PU_BUSY;
                } else if (PICO_EXC_BUF_OVERFLOW == rv) {
                    /* item stays in outBuf, feeding is retried on the
                       next call */
                    PICODBG_INFO(("feeding, overflow, PICODATA_PU_OUT_FULL"));
                    return PICODATA_PU_OUT_FULL;
                } else if ((PICO_EXC_BUF_UNDERFLOW == rv)
                           || (PICO_ERR_OTHER == rv)) {
                    PICODBG_WARN(("feeding problem, discarding item"));
                    wa->outLen = 0;
                    wa->procState = WA_STEPSTATE_COLLECT;
                    picoos_emRaiseWarning(this->common->em, rv, NULL,NULL);
                    return PICODATA_PU_BUSY;
                } else {
                    /* any other status: staying in the FEED state would
                       retry the same failing put forever inside this
                       loop, so discard the item and report the error */
                    PICODBG_ERROR(("feeding problem, unhandled"));
                    wa->outLen = 0;
                    wa->procState = WA_STEPSTATE_COLLECT;
                    picoos_emRaiseException(this->common->em, rv,
                                            NULL, NULL);
                    return PICODATA_PU_ERROR;
                }
                break;

            default:
                /* unknown state: no branch would ever change it, so
                   looping on would never terminate */
                PICODBG_ERROR(("unknown step state"));
                picoos_emRaiseException(this->common->em, PICO_ERR_OTHER,
                                        NULL, NULL);
                return PICODATA_PU_ERROR;

        } /* switch */

    } /* while */

    /* should be never reached */
    PICODBG_ERROR(("reached end of function"));
    picoos_emRaiseException(this->common->em, PICO_ERR_OTHER, NULL, NULL);
    return PICODATA_PU_ERROR;
}

#ifdef __cplusplus
}
#endif


/* end */