antlr3filestream.c - Android社区 - https://www.androidos.net.cn/

/** \file
 * \brief The ANTLR3 C filestream is used when the source character stream
 * is a filesystem based input set and all the characters in the filestream
 * can be loaded at once into memory and away the lexer goes.
 *
 * A number of initializers are provided in order that various character
 * sets can be supported from input files. The ANTLR3 C runtime expects
 * to deal with UTF32 characters only (the reasons for this are to
 * do with the simplification of C code when using this form of Unicode 
 * encoding, though this is not a panacea. More information can be
 * found on this by consulting: 
 *   - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178
 * Where a well grounded discussion of the encoding formats available
 * may be found.
 *
 */

// [The "BSD licence"]
// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
// http://www.temporal-wave.com
// http://www.linkedin.com/in/jimidle
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include    <antlr3.h>

static  void                    setupInputStream            (pANTLR3_INPUT_STREAM input);
static  pANTLR3_INPUT_STREAM    antlr3CreateFileStream      (pANTLR3_UINT8 fileName);
static  pANTLR3_INPUT_STREAM    antlr3CreateStringStream    (pANTLR3_UINT8 data);

ANTLR3_API pANTLR3_INPUT_STREAM
antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding)
{
    pANTLR3_INPUT_STREAM input;

    // First order of business is to read the file into some buffer space
    // as just straight 8 bit bytes. Then we will work out the encoding and
    // byte order and adjust the API functions that are installed for the
    // default 8Bit stream accordingly.
    //
    input   = antlr3CreateFileStream(fileName);
    if  (input == NULL)
    {
        return NULL;
    }

    // We have the data in memory now so we can deal with it according to 
    // the encoding scheme we were given by the user.
    //
    input->encoding = encoding;

    // Now we need to work out the endian type and install any 
    // API functions that differ from 8Bit
    //
    setupInputStream(input);

    // Now we can set up the file name
    //	
    input->istream->streamName	= input->strFactory->newStr8(input->strFactory, fileName);
    input->fileName		= input->istream->streamName;

    return input;
}


ANTLR3_API pANTLR3_INPUT_STREAM
antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
{
    pANTLR3_INPUT_STREAM    input;

    // First order of business is to set up the stream and install the data pointer.
    // Then we will work out the encoding and byte order and adjust the API functions that are installed for the
    // default 8Bit stream accordingly.
    //
    input   = antlr3CreateStringStream(data);
    if  (input == NULL)
    {
        return NULL;
    }
    
    // Size (in bytes) of the given 'string'
    //
    input->sizeBuf		= size;

    // We have the data in memory now so we can deal with it according to 
    // the encoding scheme we were given by the user.
    //
    input->encoding = encoding;

    // Now we need to work out the endian type and install any 
    // API functions that differ from 8Bit
    //
    setupInputStream(input);

    // Now we can set up the file name
    //	
    input->istream->streamName	= input->strFactory->newStr8(input->strFactory, name);
    input->fileName		= input->istream->streamName;

    return input;
}


/// Determine endianess of the input stream and install the
/// API required for the encoding in that format.
///
static void 
setupInputStream(pANTLR3_INPUT_STREAM input)
{
    ANTLR3_BOOLEAN  isBigEndian;

    // Used to determine the endianness of the machine we are currently
    // running on.
    //
    ANTLR3_UINT16 bomTest = 0xFEFF;
    
    // What endianess is the machine we are running on? If the incoming
    // encoding endianess is the same as this machine's natural byte order
    // then we can use more efficient API calls.
    //
    if  (*((pANTLR3_UINT8)(&bomTest)) == 0xFE)
    {
        isBigEndian = ANTLR3_TRUE;
    }
    else
    {
        isBigEndian = ANTLR3_FALSE;
    }

    // What encoding did the user tell us {s}he thought it was? I am going
    // to get sick of the questions on antlr-interest, I know I am.
    //
    switch  (input->encoding)
    {
        case    ANTLR3_ENC_UTF8:

            // See if there is a BOM at the start of this UTF-8 sequence
            // and just eat it if there is. Windows .TXT files have this for instance
            // as it identifies UTF-8 even though it is of no consequence for byte order
            // as UTF-8 does not have a byte order.
            //
            if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xEF
                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xBB
                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2))    == 0xBF
                )
            {
                // The UTF8 BOM is present so skip it
                //
                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3);
            }

            // Install the UTF8 input routines
            //
            antlr3UTF8SetupStream(input);
            break;

        case    ANTLR3_ENC_UTF16:

            // See if there is a BOM at the start of the input. If not then
            // we assume that the byte order is the natural order of this
            // machine (or it is really UCS2). If there is a BOM we determine if the encoding
            // is the same as the natural order of this machine.
            //
            if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFE
                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFF
                )
            {
                // BOM Present, indicates Big Endian
                //
                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);

                antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
            }
            else if  (      (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFF
                        &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFE
                )
            {
                // BOM present, indicates Little Endian
                //
                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);

                antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
            }
            else
            {
                // No BOM present, assume local computer byte order
                //
                antlr3UTF16SetupStream(input, isBigEndian, isBigEndian);
            }
            break;

        case    ANTLR3_ENC_UTF32:

            // See if there is a BOM at the start of the input. If not then
            // we assume that the byte order is the natural order of this
            // machine. If there is we determine if the encoding
            // is the same as the natural order of this machine.
            //
            if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0x00
                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2))    == 0xFE
                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3))    == 0xFF
                )
            {
                // BOM Present, indicates Big Endian
                //
                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);

                antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
            }
            else if  (      (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFF
                        &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFE
                        &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
                        &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
                )
            {
                // BOM present, indicates Little Endian
                //
                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);

                antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
            }
            else
            {
                // No BOM present, assume local computer byte order
                //
                antlr3UTF32SetupStream(input, isBigEndian, isBigEndian);
            }
            break;

        case    ANTLR3_ENC_UTF16BE:

            // Encoding is definately Big Endian with no BOM
            //
            antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
            break;

        case    ANTLR3_ENC_UTF16LE:

            // Encoding is definately Little Endian with no BOM
            //
            antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
            break;

        case    ANTLR3_ENC_UTF32BE:

            // Encoding is definately Big Endian with no BOM
            //
            antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
            break;

        case    ANTLR3_ENC_UTF32LE:

            // Encoding is definately Little Endian with no BOM
            //
            antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
            break;

        case    ANTLR3_ENC_EBCDIC:

            // EBCDIC is basically the same as ASCII but with an on the
            // fly translation to ASCII
            //
            antlr3EBCDICSetupStream(input);
            break;

        case    ANTLR3_ENC_8BIT:
        default:

            // Standard 8bit/ASCII
            //
            antlr38BitSetupStream(input);
            break;
    }    
}

/** \brief Use the contents of an operating system file as the input
 *         for an input stream.
 *
 * \param fileName Name of operating system file to read.
 * \return
 *	- Pointer to new input stream context upon success
 *	- One of the ANTLR3_ERR_ defines on error.
 */
static pANTLR3_INPUT_STREAM
antlr3CreateFileStream(pANTLR3_UINT8 fileName)
{
	// Pointer to the input stream we are going to create
	//
	pANTLR3_INPUT_STREAM    input;
	ANTLR3_UINT32	    status;

	if	(fileName == NULL)
	{
		return NULL;
	}

	// Allocate memory for the input stream structure
	//
	input   = (pANTLR3_INPUT_STREAM)
		ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));

	if	(input == NULL)
	{
		return	NULL;
	}

	// Structure was allocated correctly, now we can read the file.
	//
	status  = antlr3read8Bit(input, fileName);

	// Call the common 8 bit input stream handler
	// initialization.
	//
	antlr3GenericSetupStream(input);

        // However if the file was not there or something then we
        // need to close. Have to wait until here as we cannot call
        // close until the API is installed of course.
        // 
	if	(status != ANTLR3_SUCCESS)
	{
		input->close(input);
		return	NULL;
	}

	return  input;
}

ANTLR3_API ANTLR3_UINT32
antlr3read8Bit(pANTLR3_INPUT_STREAM    input, pANTLR3_UINT8 fileName)
{
	ANTLR3_FDSC	    infile;
	ANTLR3_UINT32	    fSize;

	/* Open the OS file in read binary mode
	*/
	infile  = antlr3Fopen(fileName, "rb");

	/* Check that it was there
	*/
	if	(infile == NULL)
	{
		return	(ANTLR3_UINT32)ANTLR3_ERR_NOFILE;
	}

	/* It was there, so we can read the bytes now
	*/
	fSize   = antlr3Fsize(fileName);	/* Size of input file	*/

	/* Allocate buffer for this input set   
	*/
	input->data	    = ANTLR3_MALLOC((size_t)fSize);
	input->sizeBuf  = fSize;

	if	(input->data == NULL)
	{
		return	(ANTLR3_UINT32)ANTLR3_ERR_NOMEM;
	}

	input->isAllocated	= ANTLR3_TRUE;

	/* Now we read the file. Characters are not converted to
	* the internal ANTLR encoding until they are read from the buffer
	*/
	antlr3Fread(infile, fSize, input->data);

	/* And close the file handle
	*/
	antlr3Fclose(infile);

	return  ANTLR3_SUCCESS;
}

/** \brief Open an operating system file and return the descriptor
 * We just use the common open() and related functions here. 
 * Later we might find better ways on systems
 * such as Windows and OpenVMS for instance. But the idea is to read the 
 * while file at once anyway, so it may be irrelevant.
 */
ANTLR3_API ANTLR3_FDSC
antlr3Fopen(pANTLR3_UINT8 filename, const char * mode)
{
    return  (ANTLR3_FDSC)fopen((const char *)filename, mode);
}

/** \brief Close an operating system file and free any handles
 *  etc.
 */
ANTLR3_API void
antlr3Fclose(ANTLR3_FDSC fd)
{
    fclose(fd);
}
ANTLR3_API ANTLR3_UINT32
antlr3Fsize(pANTLR3_UINT8 fileName)
{   
    struct _stat	statbuf;

    _stat((const char *)fileName, &statbuf);

    return (ANTLR3_UINT32)statbuf.st_size;
}

ANTLR3_API ANTLR3_UINT32
antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count,  void * data)
{
    return  (ANTLR3_UINT32)fread(data, (size_t)count, 1, fdsc);
}


/** \brief Use the supplied 'string' as input to the stream
 *
 * \param data Pointer to the input data
 * \return
 *	- Pointer to new input stream context upon success
 *	- NULL defines on error.
 */
static pANTLR3_INPUT_STREAM
antlr3CreateStringStream(pANTLR3_UINT8 data)
{
	// Pointer to the input stream we are going to create
	//
	pANTLR3_INPUT_STREAM    input;

	if	(data == NULL)
	{
		return NULL;
	}

	// Allocate memory for the input stream structure
	//
	input   = (pANTLR3_INPUT_STREAM)
		ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));

	if	(input == NULL)
	{
		return	NULL;
	}

	// Structure was allocated correctly, now we can install the pointer
	//
        input->data             = data;
        input->isAllocated	= ANTLR3_FALSE;

	// Call the common 8 bit input stream handler
	// initialization.
	//
	antlr3GenericSetupStream(input);

        return  input;
}