/* C++ program  |  439 lines  |  13.53 KB */

/*
 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * @file picokfst.c
 *
 * FST knowledge loading and access
 *
 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
 * All rights reserved.
 *
 * History:
 * - 2009-04-20 -- initial version
 *
 */
#include "picoos.h"
#include "picodbg.h"
#include "picoknow.h"
#include "picokfst.h"

/* Give everything below C linkage when this file is compiled as C++. */
#ifdef __cplusplus
extern "C" {
#endif
/* Never compiled: this brace only pairs textually with the one opened above
   (presumably to keep editor brace matching / auto-indentation balanced). */
#if 0
}
#endif


/* kfstInitialize starts decoding the FST parameters at this byte offset of the
   stream and adds it to every table offset it reads from the stream. */
#define FileHdrSize 4       /* size of FST file header */



/* ************************************************************/
/* function to create specialized kb, */
/* to be used by picorsrc only */
/* ************************************************************/

/** object       : FSTKnowledgeBase
 *  shortcut     : kfst
 *  derived from : picoknow_KnowledgeBase
 */

/* Handle type: pointer to the FST sub-object defined below. */
typedef struct kfst_subobj * kfst_SubObj;

/* Cached description of one FST. All members are filled in by kfstInitialize
   from the values encoded at the start of the byte stream. The "...Pos"
   members are used as byte indices into fstStream; the file header length is
   already included in them. */
typedef struct kfst_subobj{
    picoos_uint8 * fstStream;         /* the byte stream base address */
    picoos_int32 hdrLen;              /* length of file header */
    picoos_int32 transductionMode;    /* transduction mode to be used for FST */
    picoos_int32 nrClasses;           /* nr of pair/transition classes in FST; class is in [1..nrClasses] */
    picoos_int32 nrStates;            /* nr of states in FST; state is in [1..nrStates] */
    picoos_int32 termClass;           /* pair class of terminator symbol pair; probably obsolete */
    picoos_int32 alphaHashTabSize;    /* size of pair alphabet hash table (number of 4-byte slots) */
    picoos_int32 alphaHashTabPos;     /* index in fstStream of the start of the pair alphabet */
    picoos_int32 transTabEntrySize;   /* size in bytes of each transition table entry */
    picoos_int32 transTabPos;         /* index in fstStream of the start of the transition table */
    picoos_int32 inEpsStateTabPos;    /* index in fstStream of the start of the input epsilon transition table */
    picoos_int32 accStateTabPos;      /* index in fstStream of the table of accepting states */
} kfst_subobj_t;



/* ************************************************************/
/* primitives for reading from byte stream */
/* ************************************************************/

/* Reads 'nrBytes' consecutive bytes of 'stream', beginning at index '*pos', as a
   single unsigned value (first byte is the most significant) and stores it in '*num'.
   On return '*pos' is the index of the first byte after the value. */
static void FixedBytesToUnsignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_uint32 * num)
{
    picoos_uint32 acc = 0;
    picoos_int32 remaining;

    for (remaining = nrBytes; remaining > 0; remaining--) {
        acc = (acc << 8) + (picoos_uint32)stream[(*pos)++];
    }
    (*num) = acc;
}


/* Reads 'nrBytes' consecutive bytes of 'stream', beginning at index '*pos', as a
   signed value and stores it in '*num'. The bytes are first combined into an
   unsigned value v (first byte most significant); the lowest bit of v is the sign:
   even v yields v/2, odd v yields -(v+1)/2.
   On return '*pos' is the index of the first byte after the value. */
static void FixedBytesToSignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_int32 * num)
{
    picoos_uint32 raw = 0;
    picoos_uint8 remaining = nrBytes;

    while (remaining > 0) {
        raw = (raw << 8) + (picoos_uint32)stream[(*pos)++];
        remaining--;
    }

    if ((raw & 1u) != 0) {
        /* odd code: negative number */
        (*num) = -((picoos_int32)(raw >> 1)) - 1;
    } else {
        /* even code: non-negative number */
        (*num) = raw >> 1;
    }
}


/* Decodes a variable-length signed number from 'stream', beginning at index '*pos',
   and stores it in '*num'. Every byte contributes 7 bits (earlier bytes are more
   significant); bytes below 128 are followed by more bytes, the first byte >= 128
   is the last one. The lowest bit of the combined value v is the sign:
   even v yields v/2, odd v yields -(v+1)/2.
   On return '*pos' is the index of the first byte after the number. */
static void BytesToNum (picoos_uint8 * stream, picoos_uint32 * pos, picoos_int32 * num)
{
    picoos_uint32 raw = 0;
    picoos_uint32 cur;

    for (;;) {
        cur = (picoos_uint32)stream[(*pos)++];
        if (cur >= 128) {
            /* terminating byte reached */
            break;
        }
        raw = (raw << 7) + cur;
    }
    /* append the 7 payload bits of the terminating byte */
    raw = (raw << 7) + (cur - 128);

    if ((raw & 1u) != 0) {
        /* odd code: negative number */
        (*num) = -((picoos_int32)(raw >> 1)) - 1;
    } else {
        /* even code: non-negative number */
        (*num) = raw >> 1;
    }
}


/* ************************************************************/
/* setting up FST from byte stream */
/* ************************************************************/

/* Fills the kfst sub-object of 'this' from the FST byte stream at this->base.
 *
 * The values are decoded with BytesToNum, in order, starting right after the
 * file header (FileHdrSize bytes): transduction mode, nr of classes, nr of
 * states, terminator class, alphabet hash table size, alphabet offset,
 * transition entry size, transition table offset, input-epsilon table offset
 * and accepting-state table offset. Each offset is stored with the header
 * length added, so it can be used directly as an index into the stream.
 *
 * Returns PICO_OK on success; raises (and returns the result of raising)
 * PICO_EXC_KB_MISSING if 'this' or its sub-object is NULL.
 * No plausibility check is done on the decoded values. */
static pico_status_t kfstInitialize(register picoknow_KnowledgeBase this,
        picoos_Common common)
{
    picoos_uint32 curpos;
    picoos_int32 offs;
    kfst_subobj_t * kfst;

    PICODBG_DEBUG(("kfstInitialize -- start\n"));

    if (NULL == this || NULL == this->subObj) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL,
                NULL);
    }
    kfst = (kfst_subobj_t *) this->subObj;

    /* +CT+ */
    kfst->fstStream = this->base;
    /* base is a pointer, so it must be printed with %p (was %d) */
    PICODBG_TRACE(("base: %p\n", (void *) this->base));
    kfst->hdrLen = FileHdrSize;
    curpos = kfst->hdrLen;
    BytesToNum(kfst->fstStream,& curpos,& kfst->transductionMode);
    BytesToNum(kfst->fstStream,& curpos,& kfst->nrClasses);
    BytesToNum(kfst->fstStream,& curpos,& kfst->nrStates);
    BytesToNum(kfst->fstStream,& curpos,& kfst->termClass);
    BytesToNum(kfst->fstStream,& curpos,& kfst->alphaHashTabSize);
    BytesToNum(kfst->fstStream,& curpos,& offs);
    kfst->alphaHashTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream,& curpos,& kfst->transTabEntrySize);
    BytesToNum(kfst->fstStream,& curpos,& offs);
    kfst->transTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream,& curpos,& offs);
    kfst->inEpsStateTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream,& curpos,& offs);
    kfst->accStateTabPos = kfst->hdrLen + offs;
    /* -CT- */

    return PICO_OK;
}


/* Sub-object deallocation hook installed by picokfst_specializeFSTKnowledgeBase.
 * Hands the address of this->subObj to picoos_deallocate (using the same
 * 'void **' cast as the other picoos_deallocate call in this file).
 * A NULL 'this' is ignored. Always returns PICO_OK. */
static pico_status_t kfstSubObjDeallocate(register picoknow_KnowledgeBase this,
        picoos_MemoryManager mm)
{
    if (NULL != this) {
        picoos_deallocate(mm, (void **) &this->subObj);
    }
    return PICO_OK;
}


/* calculates a small number of data (e.g. addresses) from kb for fast access.
 * This data is encapsulated in a picokfst_FST that can later be retrieved
 * with picokfst_getFST.
 *
 * A kb with size 0 (dummy kb) is left untouched and PICO_OK is returned.
 * Otherwise the sub-object is allocated, the deallocation hook is installed
 * and the sub-object is initialized from the kb's byte stream.
 *
 * Returns PICO_OK on success, the raised exception status if 'this' is NULL
 * or the allocation fails, and the status of kfstInitialize if initialization
 * fails (the sub-object is released again in that case). */
pico_status_t picokfst_specializeFSTKnowledgeBase(picoknow_KnowledgeBase this,
                                                  picoos_Common common)
{
    pico_status_t status = PICO_OK;

    if (NULL == this) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL);
    }
    if (0 < this->size) {
        /* not a dummy kb */
        this->subDeallocate = kfstSubObjDeallocate;

        this->subObj = picoos_allocate(common->mm, sizeof(kfst_subobj_t));

        if (NULL == this->subObj) {
            return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL);
        }
        status = kfstInitialize(this, common);
        if (PICO_OK != status) {
            picoos_deallocate(common->mm,(void **)&this->subObj);
        }
    }
    /* report an initialization failure instead of masking it with PICO_OK */
    return status;
}


/* ************************************************************/
/* FST type and getFST function */
/* ************************************************************/



/* Returns the sub-object of kb 'this' as an FST handle for usage in a PU;
   returns NULL if 'this' is NULL. */
picokfst_FST picokfst_getFST(picoknow_KnowledgeBase this)
{
    return (NULL != this) ? (picokfst_FST) this->subObj : NULL;
}



/* ************************************************************/
/* FST access methods */
/* ************************************************************/


/* see description in header file */
/* Returns the transduction mode stored for the FST, or 0 for a NULL handle. */
extern picoos_uint8 picokfst_kfstGetTransductionMode(picokfst_FST this)
{
    kfst_SubObj fst = (kfst_SubObj) this;

    if (NULL == fst) {
        return 0;
    }
    return fst->transductionMode;
}


/* see description in header file */
/* Stores the number of states and the number of pair classes of the FST in
   '*nrStates' and '*nrClasses'; both are set to 0 for a NULL handle. */
extern void picokfst_kfstGetFSTSizes (picokfst_FST this, picoos_int32 *nrStates, picoos_int32 *nrClasses)
{
    kfst_SubObj fst = (kfst_SubObj) this;

    *nrStates = (fst != NULL) ? fst->nrStates : 0;
    *nrClasses = (fst != NULL) ? fst->nrClasses : 0;
}

/* see description in header file */
/* Looks up input symbol 'inSym' in the pair alphabet hash table.
 *
 * The slot (inSym modulo alphaHashTabSize) holds a 4-byte signed offset,
 * relative to the start of the alphabet, of the first symbol cell with that
 * hash; a non-positive offset means the slot is empty. Each cell starts with
 * the symbol id and the offset to the next cell of the same hash (both in
 * variable-length encoding); a non-positive next offset ends the chain.
 *
 * On success '*inSymFound' is set to 1 and '*searchState' to the stream
 * position right after the matching cell's two numbers, ready for
 * picokfst_kfstGetNextPair. Otherwise '*inSymFound' is 0 and '*searchState'
 * is -1. A NULL handle, a non-positive hash table size or a negative hash
 * value are reported as "not found" instead of being dereferenced, divided by
 * or used as an index. */
extern void picokfst_kfstStartPairSearch (picokfst_FST this, picokfst_symid_t inSym,
                                          picoos_bool * inSymFound, picoos_int32 * searchState)
{
    picoos_uint32 pos;
    picoos_int32 offs;
    picoos_int32 h;
    picoos_int32 inSymCellPos;
    picoos_int32 inSymX;
    picoos_int32 nextSameHashInSymOffs;

    kfst_SubObj fst = (kfst_SubObj) this;
    (*searchState) =  -1;
    (*inSymFound) = 0;
    if ((fst == NULL) || (fst->alphaHashTabSize <= 0)) {
        /* no FST, or no hash table to search (also avoids modulo by zero) */
        return;
    }
    h = inSym % fst->alphaHashTabSize;
    if (h < 0) {
        /* a negative symbol id would index in front of the hash table */
        return;
    }
    pos = fst->alphaHashTabPos + (h * 4);
    FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs);
    if (offs > 0) {
        inSymCellPos = fst->alphaHashTabPos + offs;
        pos = inSymCellPos;
        BytesToNum(fst->fstStream,& pos,& inSymX);
        BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs);
        /* follow the chain of cells sharing this hash value */
        while ((inSymX != inSym) && (nextSameHashInSymOffs > 0)) {
            inSymCellPos = inSymCellPos + nextSameHashInSymOffs;
            pos = inSymCellPos;
            BytesToNum(fst->fstStream,& pos,& inSymX);
            BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs);
        }
        if (inSymX == inSym) {
            /* input symbol found; state is set to position after symbol cell */
            (*searchState) = pos;
            (*inSymFound) = 1;
        }
    }
}


/* see description in header file */
/* Fetches the next (output symbol, pair class) entry of a search started with
 * picokfst_kfstStartPairSearch.
 *
 * If '*searchState' is negative, or the symbol decoded at that position is
 * PICOKFST_SYMID_ILLEG (end of the list), '*pairFound' is set to 0, '*outSym'
 * to PICOKFST_SYMID_ILLEG and '*pairClass' to -1; in the end-of-list case
 * '*searchState' is additionally reset to -1.
 * Otherwise '*outSym' and '*pairClass' receive the decoded entry, '*pairFound'
 * is set to 1 and '*searchState' is advanced past the entry. */
extern void picokfst_kfstGetNextPair (picokfst_FST this, picoos_int32 * searchState,
                                      picoos_bool * pairFound,
                                      picokfst_symid_t * outSym, picokfst_class_t * pairClass)
{
    kfst_SubObj fst = (kfst_SubObj) this;
    picoos_uint32 cursor;
    picoos_int32 decoded;

    if ((*searchState) >= 0) {
        cursor = (*searchState);
        BytesToNum(fst->fstStream, &cursor, &decoded);
        *outSym = (picokfst_symid_t)decoded;
        if ((*outSym) != PICOKFST_SYMID_ILLEG) {
            /* regular entry: the pair class follows the output symbol */
            BytesToNum(fst->fstStream, &cursor, &decoded);
            *pairClass = (picokfst_class_t)decoded;
            (*pairFound) = 1;
            (*searchState) = cursor;
            return;
        }
        /* terminator reached: the search is finished */
        (*searchState) = -1;
    }

    /* no active search or end of list */
    (*pairFound) = 0;
    (*outSym) = PICOKFST_SYMID_ILLEG;
    (*pairClass) = -1;
}



/* see description in header file */
/* Looks up the end state of the transition leaving 'startState' with
 * transition class 'transClass'.
 *
 * The transition table holds nrClasses entries per state, state after state;
 * each entry is transTabEntrySize bytes, first byte most significant.
 * '*endState' is set to the stored value, or to 0 if the handle is NULL,
 * 'startState' is outside [1..nrStates] or 'transClass' is outside
 * [1..nrClasses]. */
extern void picokfst_kfstGetTrans (picokfst_FST this, picokfst_state_t startState, picokfst_class_t transClass,
                                   picokfst_state_t * endState)
{

    picoos_uint32 pos;
    picoos_int32 index;
    picoos_uint32 endStateX;

    kfst_SubObj fst = (kfst_SubObj) this;
    if ((fst == NULL) || (startState < 1) || (startState > fst->nrStates)
            || (transClass < 1) || (transClass > fst->nrClasses)) {
        (*endState) = 0;
    } else {
        /* zero-based entry number: row = state, column = class */
        index = (startState - 1) * fst->nrClasses + transClass - 1;
        pos = fst->transTabPos + (index * fst->transTabEntrySize);
        FixedBytesToUnsignedNum(fst->fstStream,fst->transTabEntrySize,& pos,& endStateX);
        (*endState) = endStateX;
    }
}


/* see description in header file */
/* Starts the enumeration of the input-epsilon transitions of 'startState'.
 *
 * The input-epsilon state table holds one 4-byte signed offset per state,
 * relative to the start of that table; a positive offset points to the
 * state's transition list. In that case '*inEpsTransFound' is set to 1 and
 * '*searchState' to the stream position of the list, ready for
 * picokfst_kfstGetNextInEpsTrans. Otherwise (NULL handle, 'startState'
 * outside [1..nrStates], or non-positive offset) '*inEpsTransFound' is 0 and
 * '*searchState' is -1. */
extern void picokfst_kfstStartInEpsTransSearch (picokfst_FST this, picokfst_state_t startState,
                                                picoos_bool * inEpsTransFound, picoos_int32 * searchState)
{

    picoos_int32 offs;
    picoos_uint32 pos;

    kfst_SubObj fst = (kfst_SubObj) this;
    (*searchState) =  -1;
    (*inEpsTransFound) = 0;
    if ((fst != NULL) && (startState > 0) && (startState <= fst->nrStates)) {
        pos = fst->inEpsStateTabPos + (startState - 1) * 4;
        FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs);
        if (offs > 0) {
            (*searchState) = fst->inEpsStateTabPos + offs;
            (*inEpsTransFound) = 1;
        }
    }
}



/* see description in header file */
/* Fetches the next (output symbol, end state) entry of a search started with
 * picokfst_kfstStartInEpsTransSearch.
 *
 * If '*searchState' is negative, or the symbol decoded at that position is
 * PICOKFST_SYMID_ILLEG (end of the list), '*inEpsTransFound' is set to 0,
 * '*outSym' to PICOKFST_SYMID_ILLEG and '*endState' to 0; in the end-of-list
 * case '*searchState' is additionally reset to -1.
 * Otherwise '*outSym' and '*endState' receive the decoded entry,
 * '*inEpsTransFound' is set to 1 and '*searchState' is advanced past the
 * entry. */
extern void picokfst_kfstGetNextInEpsTrans (picokfst_FST this, picoos_int32 * searchState,
                                            picoos_bool * inEpsTransFound,
                                            picokfst_symid_t * outSym, picokfst_state_t * endState)
{
    kfst_SubObj fst = (kfst_SubObj) this;
    picoos_uint32 cursor;
    picoos_int32 decoded;

    if ((*searchState) >= 0) {
        cursor = (*searchState);
        BytesToNum(fst->fstStream, &cursor, &decoded);
        *outSym = (picokfst_symid_t)decoded;
        if ((*outSym) != PICOKFST_SYMID_ILLEG) {
            /* regular entry: the end state follows the output symbol */
            BytesToNum(fst->fstStream, &cursor, &decoded);
            *endState = (picokfst_state_t)decoded;
            (*inEpsTransFound) = 1;
            (*searchState) = cursor;
            return;
        }
        /* terminator reached: the search is finished */
        (*searchState) = -1;
    }

    /* no active search or end of list */
    (*inEpsTransFound) = 0;
    (*outSym) = PICOKFST_SYMID_ILLEG;
    (*endState) = 0;
}


/* see description in header file */
/* Tells whether 'state' is an accepting state.
 *
 * The accepting-state table holds one byte per state; the state is accepting
 * if its byte equals 1. Returns 0 if the handle is NULL or 'state' is outside
 * [1..nrStates]. */
extern picoos_bool picokfst_kfstIsAcceptingState (picokfst_FST this, picokfst_state_t state)
{

    picoos_uint32 pos;
    picoos_uint32 val;

    kfst_SubObj fst = (kfst_SubObj) this;
    if ((fst != NULL) && (state > 0) && (state <= fst->nrStates)) {
        pos = fst->accStateTabPos + (state - 1);
        FixedBytesToUnsignedNum(fst->fstStream,1,& pos,& val);
        return (val == 1);
    } else {
        return 0;
    }
}

#ifdef __cplusplus
}
#endif

/* End picokfst.c */