/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: ucdmerge.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb20
* created by: Markus W. Scherer
*
* Simple tool for Unicode Character Database files with semicolon-delimited fields.
* Merges adjacent, identical per-code point data lines into one line with range syntax.
*
* To compile, just call a C compiler/linker with this source file.
* On Windows: cl ucdmerge.c
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
 * Step over leading blanks (spaces and horizontal tabs only).
 * Returns a pointer to the first character that is neither, which may be
 * the terminating NUL if the rest of the string is blank.
 */
static const char *
skipWhitespace(const char *s) {
    for(;;) {
        char ch=*s;
        if(ch!=' ' && ch!='\t') {
            return s;
        }
        ++s;
    }
}
/*
 * Find where the data portion of l stops.
 * If l contains a '#', the data ends before it and before any spaces/tabs
 * that directly precede it; otherwise the data ends at the terminating NUL.
 * Returns a pointer to the first character after the data.
 */
static char *
endOfData(const char *l) {
    char *hash=strchr(l, '#');

    if(hash==NULL) {
        /* no comment: the data runs to the end of the string */
        return strchr(l, '\0');
    }

    /* back up over blanks that separate the data from the comment */
    while(hash!=l) {
        char prev=hash[-1];
        if(prev!=' ' && prev!='\t') {
            break;
        }
        --hash;
    }
    return hash;
}
/*
 * Compare the data portions of two lines.
 * The data portion starts right after the first semicolon and ends where
 * endOfData() says it does (before a trailing comment, or at end of string).
 * Returns non-zero if both data portions are byte-for-byte identical.
 * Returns 0 if they differ, or if either line has no semicolon at all.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length1, length2;

    /* find the first semicolon in each line */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        /* callers are expected to pass data lines; never compute NULL+1 */
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions; lengths are never negative */
    length1=(size_t)(end1-l1);
    length2=(size_t)(end2-l2);
    return length1==length2 && 0==memcmp(l1, l2, length1);
}
/*
 * Read one line from stdin into buffer, dropping the trailing newline.
 * capacity is the size of buffer in bytes and must be at least 2.
 * Returns 1 if a line was stored, 0 at end of input (or on a read error),
 * and -1 if the line does not fit into the buffer.
 */
static int
readLine(char *buffer, size_t capacity) {
    size_t length;
    int next;

    if(fgets(buffer, (int)capacity, stdin)==NULL) {
        return 0;
    }

    length=strlen(buffer);
    if(length>0 && buffer[length-1]=='\n') {
        /* complete line: strip the newline so that puts() can add it back */
        buffer[length-1]=0;
        return 1;
    }
    if(length<capacity-1) {
        /* fgets stopped early without a newline, normally an unterminated last line */
        return 1;
    }

    /* the buffer is full: this is only fine if the line ends exactly here */
    next=getc(stdin);
    if(next==EOF || next=='\n') {
        return 1;
    }
    return -1;
}

/*
 * Filter stdin to stdout.
 * A data line starts with a hexadecimal code point that is followed, after
 * optional spaces/tabs, by a semicolon. Adjacent data lines with consecutive
 * code points whose data equals that of the first line of the run (see
 * sameData()) are written as one line with "first..last" range syntax.
 * All other lines are copied unchanged.
 * Returns 0 on success, EXIT_FAILURE if an input line is too long.
 */
extern int
main(int argc, const char *argv[]) {
    static char line[2000], firstLine[2000], lastLine[2000];
    char *end;
    long first, last, c;
    int finished, status;

    (void)argc;
    (void)argv;

    first=last=-1;
    finished=0;

    for(;;) {
        /* bounded read; the line buffer can never be overrun */
        status=readLine(line, sizeof(line));
        if(status<0) {
            fprintf(stderr, "ucdmerge: input line longer than %lu characters\n",
                    (unsigned long)(sizeof(line)-1));
            return EXIT_FAILURE;
        }

        if(status>0) {
            /* parse the initial code point, if any */
            c=strtol(line, &end, 16);
            if(end!=line && *skipWhitespace(end)==';') {
                /* single code point followed by semicolon and data, keep c */
            } else {
                c=-1;
            }
        } else {
            /* end of input: use an empty non-data line to flush a pending range */
            line[0]=0;
            c=-1;
            finished=1;
        }

        /* sameData() is only reached when c==(last+1), i.e. when line is a data line */
        if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
            /* output the current range */
            if(first==last) {
                /* there was no range, just output the one line we found */
                puts(firstLine);
            } else {
                /* there was a real range, merge their lines */
                end=strchr(lastLine, '#');
                if(end==NULL) {
                    /* no comment in second line */
                    printf("%04lX..%04lX%s\n",
                           first, last,             /* code point range */
                           strchr(firstLine, ';')); /* first line starting from the first ; */
                } else if(strchr(firstLine, '#')==NULL) {
                    /* no comment in first line */
                    printf("%04lX..%04lX%s%s\n",
                           first, last,             /* code point range */
                           strchr(firstLine, ';'),  /* first line starting from the first ; */
                           end);                    /* comment from second line */
                } else {
                    /* merge comments from both lines */
                    printf("%04lX..%04lX%s..%s\n",
                           first, last,             /* code point range */
                           strchr(firstLine, ';'),  /* first line starting from the first ; */
                           skipWhitespace(end+1));  /* comment from second line, after # and spaces */
                }
            }
            first=last=-1;
        }

        if(c<0) {
            if(finished) {
                break;
            }
            /* no data on this line, output as is */
            puts(line);
        } else {
            /* data on this line, store for possible range compaction */
            if(last<0) {
                /* set as the first line in a possible range */
                first=last=c;
                strcpy(firstLine, line);
                lastLine[0]=0;
            } else /* must be c==(last+1) && sameData() because of previous conditions */ {
                /* continue with the current range */
                last=c;
                strcpy(lastLine, line);
            }
        }
    }

    return 0;
}