// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2000-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uparse.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2000apr18
* created by: Markus W. Scherer
*
* This file provides a parser for files that are delimited by one single
* character like ';' or TAB. Example: the Unicode Character Properties files
* like UnicodeData.txt are semicolon-delimited.
*/
#ifndef __UPARSE_H__
#define __UPARSE_H__
#include "unicode/utypes.h"
/**
* Test whether c is one of the invariant-character whitespace characters:
* space, TAB, carriage return or line feed.
*
* This is a macro and c may be evaluated more than once, so the argument
* should not have side effects.
*
* @param c invariant character
* @return Nonzero if c is one of the four whitespace characters, otherwise 0.
*/
#define U_IS_INV_WHITESPACE(c) ( \
    (c)==' '  || \
    (c)=='\t' || \
    (c)=='\r' || \
    (c)=='\n' )
U_CDECL_BEGIN
/**
* Skip space ' ' and TAB '\t' characters.
*
* NOTE(review): U_IS_INV_WHITESPACE in this header also treats '\r' and '\n'
* as whitespace; confirm against the implementation whether line endings are
* skipped here as well.
*
* @param s Pointer to characters. Presumably NUL-terminated - TODO confirm.
* @return Pointer to first character at or after s that is not a space or TAB.
*/
U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char *s);
/**
* Trim whitespace (including line endings) from the end of the string.
*
* NOTE(review): s is not const, so the string is presumably shortened in place
* by writing a new terminating NUL - confirm against the implementation.
*
* @param s Pointer to the string.
* @return Pointer to the new end of the string.
*/
U_CAPI char * U_EXPORT2
u_rtrim(char *s);
/**
* Function type for u_parseDelimitedFile().
* A function of this type is called by the parser for each line after the
* line has been segmented into fields.
*
* @param context The same context pointer that was passed to
* u_parseDelimitedFile().
* @param fields Array of field start and limit pointers:
* fields[i][0] points to the beginning of field i and
* fields[i][1] points behind it (to the delimiter or the line end).
* @param fieldCount Field count for the fields array. Presumably the same
* value that was passed to u_parseDelimitedFile() - TODO confirm.
* @param pErrorCode ICU error code. If the function sets an error code here,
* the parser returns with that error.
*/
typedef void U_CALLCONV
UParseLineFn(void *context,
char *fields[][2],
int32_t fieldCount,
UErrorCode *pErrorCode);
/**
* Parser for files that are similar to UnicodeData.txt:
* This function opens the file and reads it line by line. It skips empty lines
* and comment lines that start with a '#'.
* All other lines are separated into fields with one delimiter character
* (semicolon for Unicode Properties files) between two fields. The last field in
* a line does not need to be terminated with a delimiter.
*
* For each line, after segmenting it, a line function is called.
* It gets passed the array of field start and limit pointers that is
* passed into this parser and filled by it for each line.
* For each field i of the line, the start pointer in fields[i][0]
* points to the beginning of the field, while the limit pointer in fields[i][1]
* points behind the field, i.e., to the delimiter or the line end.
*
* The context parameter of the line function is
* the same as the one for the parse function.
*
* The line function may modify the contents of the fields including the
* limit characters.
*
* If the file cannot be opened, or there is a parsing error or the line function
* sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
*
* @param filename Name of the file to open and parse.
* @param delimiter The single character that separates two fields in a line.
* @param fields Caller-provided array of start/limit pointer pairs that the
* parser fills for each line and hands to lineFn.
* @param fieldCount Presumably the number of entries in fields, i.e. the
* number of fields expected per line - TODO confirm.
* @param lineFn Line function that is called once for each parsed line.
* @param context Opaque pointer that is passed through to lineFn.
* @param pErrorCode ICU error code; set on failure as described above.
*/
U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char *filename, char delimiter,
char *fields[][2], int32_t fieldCount,
UParseLineFn *lineFn, void *context,
UErrorCode *pErrorCode);
/**
* Parse a string of code points like 0061 0308 0300.
* s must end with either ';' or NUL.
*
* @param s Input char * string.
* @param dest Output buffer for the parsed code points.
* @param destCapacity Capacity of dest; presumably counted in uint32_t
* elements - TODO confirm.
* @param pErrorCode ICU error code.
* @return Number of code points.
*/
U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char *s,
uint32_t *dest, int32_t destCapacity,
UErrorCode *pErrorCode);
/**
* Parse a list of code points like 0061 0308 0300
* into a UChar * string.
* s must end with either ';' or NUL.
*
* Optionally, the first code point is also returned via *pFirst.
*
* @param s Input char * string.
* @param dest Output string buffer.
* @param destCapacity Capacity of dest in numbers of UChars.
* @param pFirst If pFirst!=NULL then *pFirst will be set to the first
* code point in the string.
* @param pErrorCode ICU error code.
* @return The length of the string in numbers of UChars.
*/
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
UChar *dest, int32_t destCapacity,
uint32_t *pFirst,
UErrorCode *pErrorCode);
/**
* Parse a code point range like
* 0085 or
* 4E00..9FA5.
*
* s must contain such a range and end with either ';' or NUL.
*
* @param s Input char * string containing the range.
* @param pStart Output: presumably receives the first code point of the
* range - TODO confirm.
* @param pEnd Output: presumably receives the last code point of the range
* (equal to the start for a single code point) - TODO confirm.
* @param pErrorCode ICU error code.
* @return Length of code point range, end-start+1
*/
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
uint32_t *pStart, uint32_t *pEnd,
UErrorCode *pErrorCode);
/**
* Same as u_parseCodePointRange() but the range may be terminated by
* any character. The position of the terminating character is returned via
* the *terminator output parameter.
*
* @param s Input char * string containing the range.
* @param pStart Output for the range start, as for u_parseCodePointRange().
* @param pEnd Output for the range end, as for u_parseCodePointRange().
* @param terminator Output: *terminator is set to the position of the
* character that terminates the range.
* @param pErrorCode ICU error code.
* @return Presumably the length of the code point range, as for
* u_parseCodePointRange() - TODO confirm.
*/
U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char *s,
uint32_t *pStart, uint32_t *pEnd,
const char **terminator,
UErrorCode *pErrorCode);
/**
* NOTE(review): this function has no documentation in this header; the
* description below is derived from the signature only and must be confirmed
* against the implementation.
*
* Parse the char * input in source and write the resulting bytes to dest.
*
* @param source Input char * string.
* @param sLen Length of source; whether a negative value means
* "NUL-terminated" is not visible here - TODO confirm.
* @param dest Output char buffer.
* @param destCapacity Capacity of dest, presumably in chars - TODO confirm.
* @param status ICU error code.
* @return Presumably the number of chars written to (or needed in) dest -
* TODO confirm.
*/
U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);
U_CDECL_END
#endif