#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <regex.h>

#include "util/neo_misc.h"
#include "util/neo_err.h"
#include "util/neo_str.h"
#include "html.h"
#include "cgi.h"
/* Heuristically decide whether plain text relies on whitespace layout.
 *
 * src  - text to inspect; only the first slen bytes are read
 * slen - number of bytes of src to inspect
 *
 * Returns 2 when more than three "ASCII art" punctuation characters are
 *           seen without an ordinary character in between,
 *         1 when a tab is present, or an ordinary character follows a run
 *           of more than two spaces on a line or more than two newlines,
 *         0 otherwise.
 *
 * A space directly after '.' is not counted, so normal sentence spacing
 * does not trigger the heuristic.  '\r' is ignored entirely.
 */
static int has_space_formatting(const char *src, int slen)
{
  static const char art_chars[] = "/\\<>:[]!@#$%^&*()|";
  int spaces = 0;
  int returns = 0;
  int ascii_art = 0;
  int x;

  for (x = 0; x < slen; x++)
  {
    const char c = src[x];

    if (c == '\t')
      return 1;

    if (c == ' ')
    {
      if (!(x && (src[x-1] == '.')))
        spaces++;
    }
    else if (c == '\n')
    {
      spaces = 0;
      returns++;
    }
    /* strchr() also matches the terminating NUL of the set, so an embedded
     * '\0' must be excluded explicitly or it would count as ASCII art. */
    else if (c != '\0' && strchr (art_chars, c))
    {
      ascii_art++;
      if (ascii_art > 3)
        return 2;
    }
    else if (c != '\r')
    {
      /* An ordinary character: judge the whitespace run that led up to
       * it, then start counting afresh. */
      if (returns > 2) return 1;
      if (spaces > 2) return 1;
      returns = 0;
      spaces = 0;
      ascii_art = 0;
    }
  }
  return 0;
}
/*
static int has_long_lines (char *s, int l)
{
char *ptr;
int x = 0;
while (x < l)
{
ptr = strchr (s + x, '\n');
if (ptr == NULL)
{
if (l - x > 75) return 1;
return 0;
}
if (ptr - (s + x) > 75) return 1;
x = ptr - s + 1;
}
return 0;
}
*/
/* The first step is to find all of the URLs and email addresses using
 * our handy regular expressions and to record where they are. We then
 * walk through the text: non-special areas are converted with straight
 * text->html escapes, and the recorded special parts are converted as
 * special parts (links).
 */
/* One URL or email address located in the source text by split_and_convert(). */
struct _parts {
/* Offset into the source text of the first character of the match (taken from regmatch_t.rm_so). */
int begin;
/* Offset into the source text just past the match (taken from regmatch_t.rm_eo). */
int end;
/* Kind of match; split_and_convert() stores SC_TYPE_URL or SC_TYPE_EMAIL here. */
int type;
};
/* Values for _parts.type.  SC_TYPE_TEXT is defined but not referenced by the code visible in this file. */
#define SC_TYPE_TEXT 1
#define SC_TYPE_URL 2
#define SC_TYPE_EMAIL 3
/* Pattern used to recognise an email address; compiled in split_and_convert() with REG_ICASE | REG_EXTENDED. */
static char *EmailRe = "[^][@:;<>\\\"()[:space:][:cntrl:]]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]";
/* Pattern used to recognise a URL: an http/https/ftp/mailto scheme, or a bare host name starting with "www.".
 * Compiled in split_and_convert() with REG_ICASE | REG_EXTENDED. */
static char *URLRe = "((http|https|ftp|mailto):(//)?[^[:space:]>\"\t]*|www\\.[-a-z0-9\\.]+)[^[:space:];\t\">]*";
/* Run `re` against the NUL-terminated text starting at src + offset.
 * On a hit, *m receives offsets relative to src (not to src + offset);
 * on a miss both fields of *m are set to -1. */
static void sc_search (const regex_t *re, const char *src, int offset,
                       regmatch_t *m)
{
  if (regexec (re, src + offset, 1, m, 0) != 0)
  {
    m->rm_so = -1;
    m->rm_eo = -1;
    return;
  }
  m->rm_so += offset;
  m->rm_eo += offset;
}

/* Emit the run of spaces queued while opts->space_convert is on: all but
 * the last as "&nbsp;" and the last as a plain space, then reset the
 * counter.  Does nothing (besides the reset) when no spaces are queued. */
static NEOERR *sc_flush_spaces (STRING *out, int *spaces)
{
  NEOERR *err = STATUS_OK;
  int sp;

  if (*spaces)
  {
    for (sp = 0; sp < *spaces - 1; sp++)
    {
      err = string_append (out, "&nbsp;");
      if (err != STATUS_OK) return err;
    }
    err = string_append_char (out, ' ');
  }
  *spaces = 0;
  return err;
}

/* Return a malloc'd, NUL-terminated copy of the n bytes at s, prefixed
 * with "http://" when s starts with "www." (case-insensitive).  Returns
 * NULL when the allocation fails. */
static char *sc_dup_url (const char *s, int n)
{
  static const char scheme[] = "http://";
  int plen = strncasecmp (s, "www.", 4) ? 0 : (int) (sizeof(scheme) - 1);
  char *url = malloc (plen + n + 1);

  if (url == NULL) return NULL;
  memcpy (url, scheme, plen);
  memcpy (url + plen, s, n);
  url[plen + n] = '\0';
  return url;
}

/* Emit the HTML for one special character of the plain-text run:
 * '&', '<' and '>' become entities, '\r' is dropped, and '\n' becomes a
 * line break, a paragraph break (second of two consecutive '\n' when
 * newlines_convert is off) or a bare newline. */
static NEOERR *sc_emit_special (STRING *out, const char *src, int x,
                                HTML_CONVERT_OPTS *opts)
{
  switch (src[x])
  {
    case '&':
      return string_append (out, "&amp;");
    case '<':
      return string_append (out, "&lt;");
    case '>':
      return string_append (out, "&gt;");
    case '\r':
      return STATUS_OK;
    case '\n':
      if (opts->newlines_convert)
        return string_append (out, "<br/>\n");
      if (x && src[x-1] == '\n')
        return string_append (out, "<p/>\n");
      return string_append_char (out, '\n');
    default:
      return nerr_raise (NERR_ASSERT, "src[x] == '%c'", src[x]);
  }
}

/* Emit an <a> element for the URL of `len` bytes at `url`.  A trailing
 * '.' or ',' is treated as sentence punctuation: it is left out of the
 * link and emitted as plain text after it. */
static NEOERR *sc_emit_url (STRING *out, const char *url, int len,
                            HTML_CONVERT_OPTS *opts)
{
  NEOERR *err;
  char *esc = NULL;
  char last_char = url[len - 1];
  int suffix = (last_char == '.' || last_char == ',') ? 1 : 0;
  int body_len = len - suffix;

  err = string_append (out, " <a ");
  if (err != STATUS_OK) return err;
  if (opts->url_class)
  {
    err = string_appendf (out, "class=%s ", opts->url_class);
    if (err != STATUS_OK) return err;
  }
  if (opts->url_target)
  {
    err = string_appendf (out, "target=\"%s\" ", opts->url_target);
    if (err != STATUS_OK) return err;
  }
  err = string_append (out, "href=\"");
  if (err != STATUS_OK) return err;

  if (opts->bounce_url)
  {
    /* bounce_url is a printf-style format that receives the URL-escaped
     * target as its single argument. */
    char *raw;
    char *esc_url = NULL; /* stays NULL if cgi_url_escape fails early */
    char *new_url;

    raw = sc_dup_url (url, body_len);
    if (raw == NULL)
      return nerr_raise (NERR_NOMEM,
                         "Unable to allocate memory to convert url");
    err = cgi_url_escape (raw, &esc_url);
    free (raw);
    if (err != STATUS_OK)
    {
      free (esc_url);
      return err;
    }
    new_url = sprintf_alloc (opts->bounce_url, esc_url);
    free (esc_url);
    if (new_url == NULL)
      return nerr_raise (NERR_NOMEM,
                         "Unable to allocate memory to convert url");
    err = string_append (out, new_url);
    free (new_url);
    if (err != STATUS_OK) return err;
  }
  else
  {
    if (!strncasecmp (url, "www.", 4))
    {
      err = string_append (out, "http://");
      if (err != STATUS_OK) return err;
    }
    err = string_appendn (out, url, body_len);
    if (err != STATUS_OK) return err;
  }

  err = string_append (out, "\">");
  if (err != STATUS_OK) return err;

  /* Link text: the configured fixed name, or the URL itself. */
  if (opts->link_name)
    err = html_escape_alloc (opts->link_name, strlen (opts->link_name), &esc);
  else
    err = html_escape_alloc (url, body_len, &esc);
  if (err != STATUS_OK) return err;
  err = string_append (out, esc);
  free (esc);
  if (err != STATUS_OK) return err;

  err = string_append (out, "</a>");
  if (err != STATUS_OK) return err;
  if (suffix)
    err = string_appendn (out, url + len - 1, 1);
  return err;
}

/* Emit a mailto: <a> element for the address of `len` bytes at `addr`. */
static NEOERR *sc_emit_email (STRING *out, const char *addr, int len,
                              HTML_CONVERT_OPTS *opts)
{
  NEOERR *err;
  char *esc = NULL;

  err = string_append (out, "<a ");
  if (err != STATUS_OK) return err;
  if (opts->mailto_class)
  {
    err = string_appendf (out, "class=%s ", opts->mailto_class);
    if (err != STATUS_OK) return err;
  }
  err = string_append (out, "href=\"mailto:");
  if (err != STATUS_OK) return err;
  err = string_appendn (out, addr, len);
  if (err != STATUS_OK) return err;
  err = string_append (out, "\">");
  if (err != STATUS_OK) return err;
  err = html_escape_alloc (addr, len, &esc);
  if (err != STATUS_OK) return err;
  err = string_append (out, esc);
  free (esc);
  if (err != STATUS_OK) return err;
  return string_append (out, "</a>");
}

/* Convert plain text to HTML, appending the result to `out`.
 *
 * src  - text to convert.  It is searched with regexec()/strpbrk(), so it
 *        must be NUL-terminated; slen is expected not to exceed its length.
 * slen - number of bytes of src to convert
 * out  - STRING the HTML is appended to
 * opts - conversion options; read only (url_class, url_target,
 *        bounce_url, link_name, mailto_class, space_convert,
 *        newlines_convert)
 *
 * Pass 1 records every URL / email address match (earliest match first;
 * a URL wins a tie).  Pass 2 walks the text: recorded spans become links,
 * everything else is copied with '&', '<', '>' escaped, '\r' dropped and
 * '\n' converted.  With opts->space_convert set, a run of n spaces is
 * emitted as n-1 "&nbsp;" plus one plain space; a run directly before
 * '\n' or at the very end of the text is dropped.
 *
 * Returns STATUS_OK, NERR_PARSE if a pattern fails to compile,
 * NERR_NOMEM on allocation failure, or whatever error the string / escape
 * helpers report.
 *
 * NOTE(review): the markup literals ("&nbsp;", "&amp;", "&lt;", "&gt;",
 * " <a ", "<a ", "</a>", "<br/>", "<p/>") were damaged in the source this
 * was written from and are spelled out here from context -- confirm the
 * exact "<br/>" / "<p/>" spelling against the project's expected output.
 */
static NEOERR *split_and_convert (const char *src, int slen,
                                  STRING *out, HTML_CONVERT_OPTS *opts)
{
  NEOERR *err = STATUS_OK;
  static int compiled = 0;
  static regex_t email_re, url_re;
  regmatch_t email_match, url_match;
  int errcode;
  const char *ptr;
  char errbuf[256];
  struct _parts *parts;
  int part_count = 20;
  int part = 0;
  int x = 0;
  int i = 0;
  int spaces = 0;

  if (!compiled)
  {
    if ((errcode = regcomp (&email_re, EmailRe, REG_ICASE | REG_EXTENDED)))
    {
      regerror (errcode, &email_re, errbuf, sizeof(errbuf));
      return nerr_raise (NERR_PARSE, "Unable to compile EmailRE: %s", errbuf);
    }
    if ((errcode = regcomp (&url_re, URLRe, REG_ICASE | REG_EXTENDED)))
    {
      regerror (errcode, &url_re, errbuf, sizeof(errbuf));
      /* The next call compiles email_re again; release this copy. */
      regfree (&email_re);
      return nerr_raise (NERR_PARSE, "Unable to compile URLRe: %s", errbuf);
    }
    compiled = 1;
  }

  parts = malloc (sizeof(*parts) * part_count);
  if (parts == NULL)
    return nerr_raise (NERR_NOMEM,
                       "Unable to allocate memory to convert text");

  /* Pass 1: record every URL / email span, in order of appearance. */
  sc_search (&email_re, src, 0, &email_match);
  sc_search (&url_re, src, 0, &url_match);
  while ((x < slen) && !((email_match.rm_so == -1) && (url_match.rm_so == -1)))
  {
    regmatch_t *hit, *other;
    regex_t *hit_re, *other_re;
    int type;

    if (part >= part_count)
    {
      struct _parts *grown = realloc (parts,
                                      sizeof(*parts) * part_count * 2);
      if (grown == NULL)
      {
        free (parts);
        return nerr_raise (NERR_NOMEM,
                           "Unable to allocate memory to convert text");
      }
      parts = grown;
      part_count *= 2;
    }

    if ((url_match.rm_so != -1) &&
        ((email_match.rm_so == -1) || (url_match.rm_so <= email_match.rm_so)))
    {
      hit = &url_match;     hit_re = &url_re;
      other = &email_match; other_re = &email_re;
      type = SC_TYPE_URL;
    }
    else
    {
      hit = &email_match;   hit_re = &email_re;
      other = &url_match;   other_re = &url_re;
      type = SC_TYPE_EMAIL;
    }

    parts[part].begin = hit->rm_so;
    parts[part].end = hit->rm_eo;
    parts[part].type = type;
    x = parts[part].end + 1;
    part++;

    if (x < slen)
    {
      sc_search (hit_re, src, x, hit);
      /* The other pattern's pending match is stale if it started inside
       * (or right behind) the span just recorded. */
      if ((other->rm_so != -1) && (x > other->rm_so))
        sc_search (other_re, src, x, other);
    }
  }

  /* Pass 2: emit escaped text and links. */
  x = 0;
  while (x < slen)
  {
    if ((i < part) && (x >= parts[i].begin))
    {
      /* At a recorded span: render it as a link. */
      err = sc_flush_spaces (out, &spaces);
      if (err != STATUS_OK) break;
      if (parts[i].type == SC_TYPE_URL)
        err = sc_emit_url (out, src + x, parts[i].end - x, opts);
      else /* type == SC_TYPE_EMAIL */
        err = sc_emit_email (out, src + x, parts[i].end - x, opts);
      if (err != STATUS_OK) break;
      x = parts[i].end;
      i++;
      continue;
    }

    ptr = strpbrk (src + x, "&<>\r\n ");
    if ((ptr == NULL) || ((i < part) && ((ptr - src) >= parts[i].begin)))
    {
      /* Nothing special before the next span (or before the end):
       * copy the run verbatim. */
      err = sc_flush_spaces (out, &spaces);
      if (err != STATUS_OK) break;
      if (i < part)
      {
        err = string_appendn (out, src + x, parts[i].begin - x);
        x = parts[i].begin;
      }
      else
      {
        err = string_append (out, src + x);
        x = slen;
      }
      if (err != STATUS_OK) break;
      continue;
    }

    /* A special character lies before the next span.  Queued spaces are
     * flushed only when literal text separates them from it, so that
     * consecutive spaces accumulate into a single run. */
    if ((ptr - src) > x)
    {
      err = sc_flush_spaces (out, &spaces);
      if (err != STATUS_OK) break;
      err = string_appendn (out, src + x, (ptr - src) - x);
      if (err != STATUS_OK) break;
    }
    x = (int) (ptr - src);

    if (src[x] == ' ')
    {
      if (opts->space_convert)
        spaces++;
      else
        err = string_append_char (out, ' ');
    }
    else
    {
      if (src[x] != '\n')
      {
        err = sc_flush_spaces (out, &spaces);
        if (err != STATUS_OK) break;
      }
      else
      {
        /* Spaces directly before a newline are dropped. */
        spaces = 0;
      }
      err = sc_emit_special (out, src, x, opts);
    }
    if (err != STATUS_OK) break;
    x++;
  }

  free (parts);
  return err;
}
/* Remove trailing white space from every line of str, in place.
 *
 * For each '\n' the white space directly in front of it (up to, but not
 * including, the previous '\n') is deleted.  If the text does not end in
 * '\n', all white space at the very end of the buffer is removed as well
 * (this can include newlines).  str->len is kept in sync with the
 * shortened, NUL-terminated buffer.
 *
 * Assumes str->buf is NUL-terminated at str->buf[str->len] whenever
 * str->len is non-zero.
 */
static void strip_white_space_end (STRING *str)
{
  int x = 0;

  while (x < str->len)
  {
    char *nl = strchr (str->buf + x, '\n');
    int nl_at;
    int keep;

    if (nl == NULL)
    {
      /* just strip the white space at the end of the string */
      int ol = strlen (str->buf);
      while (ol && isspace ((unsigned char) str->buf[ol - 1]))
      {
        str->buf[ol - 1] = '\0';
        ol--;
      }
      str->len = ol;
      return;
    }

    nl_at = (int) (nl - str->buf);

    /* Walk back over the white space in front of the newline.  `keep` is
     * the index the newline will end up at; index 0 is examined like any
     * other position, so a non-blank first character is never lost. */
    keep = nl_at;
    while (keep > 0 &&
           str->buf[keep - 1] != '\n' &&
           isspace ((unsigned char) str->buf[keep - 1]))
    {
      keep--;
    }

    if (keep < nl_at)
    {
      /* Slide the newline, the rest of the text and the NUL down. */
      memmove (str->buf + keep, nl, str->len - nl_at + 1);
      str->len -= (nl_at - keep);
      str->buf[str->len] = '\0';
    }

    /* Always step past the newline so the scan makes progress, even when
     * the newline is the first character of the buffer. */
    x = keep + 1;
  }
}
/* Convert plain text to HTML using the built-in default options.
 *
 * Equivalent to calling convert_text_html_alloc_options() with a NULL
 * options pointer; see that function for the meaning of src, slen and out.
 */
NEOERR *convert_text_html_alloc (const char *src, int slen,
                                 char **out)
{
  NEOERR *err;

  err = convert_text_html_alloc_options (src, slen, out, NULL);
  return nerr_pass (err);
}
/* Convert plain text to HTML.
 *
 * src  - text to convert (handed to has_space_formatting() and
 *        split_and_convert())
 * slen - number of bytes of src to convert
 * out  - receives the converted text: the internal STRING buffer, or a
 *        strdup'd "" when nothing was produced; presumably to be released
 *        with free() by the caller
 * opts - conversion options, or NULL for the defaults set up below.
 *        The structure is copied and never modified, so the same options
 *        can be reused across calls.
 *
 * When opts->check_ascii_art is set the text is inspected first: any
 * detected whitespace formatting turns on space conversion, and ASCII art
 * (result 2) additionally forces newline conversion, wraps the output in
 * <tt>...</tt> and strips trailing white space from each output line.
 *
 * Returns STATUS_OK on success; on failure the partial output is released
 * and the error is passed up (NERR_NOMEM if the empty result cannot be
 * allocated).
 *
 * NOTE(review): the "<tt>" / "</tt>" wrappers were empty strings in the
 * source this was written from, which showed tag-stripping damage
 * elsewhere -- confirm against the expected output.
 */
NEOERR *convert_text_html_alloc_options (const char *src, int slen,
                                         char **out,
                                         HTML_CONVERT_OPTS *opts)
{
  NEOERR *err = STATUS_OK;
  STRING out_s;
  int formatting = 0;
  HTML_CONVERT_OPTS my_opts;

  string_init(&out_s);

  if (opts == NULL)
  {
    my_opts.bounce_url = NULL;
    my_opts.url_class = NULL;
    my_opts.url_target = "_blank";
    my_opts.mailto_class = NULL;
    my_opts.long_lines = 0;
    my_opts.space_convert = 0;
    my_opts.newlines_convert = 1;
    my_opts.longline_width = 75; /* This hasn't been used in a while, actually */
    my_opts.check_ascii_art = 1;
    my_opts.link_name = NULL;
  }
  else
  {
    /* Work on a private copy: the adjustments below are per-call
     * decisions and must not leak into the caller's structure. */
    my_opts = *opts;
  }

  if (my_opts.check_ascii_art)
  {
    formatting = has_space_formatting (src, slen);
    if (formatting) my_opts.space_convert = 1;
  }

  if (formatting == 2)
  {
    /* Do formatting */
    my_opts.newlines_convert = 1;
    err = string_append (&out_s, "<tt>");
    if (err == STATUS_OK)
      err = split_and_convert (src, slen, &out_s, &my_opts);
    if (err == STATUS_OK)
      err = string_append (&out_s, "</tt>");
    /* Strip white space at end of lines */
    if (err == STATUS_OK)
      strip_white_space_end (&out_s);
  }
  else
  {
    err = split_and_convert (src, slen, &out_s, &my_opts);
  }

  if (err != STATUS_OK)
  {
    string_clear (&out_s);
    return nerr_pass (err);
  }

  if (out_s.buf == NULL)
  {
    *out = strdup("");
    if (*out == NULL)
      return nerr_raise (NERR_NOMEM,
                         "Unable to allocate memory for empty result");
  }
  else
  {
    *out = out_s.buf;
  }
  return STATUS_OK;
}
/* HTML-escape slen bytes of src into a newly allocated string stored in
 * *out.  This is a thin wrapper that forwards to neos_html_escape() and
 * passes its result up unchanged.
 */
NEOERR *html_escape_alloc (const char *src, int slen,
                           char **out)
{
  NEOERR *err;

  err = neos_html_escape (src, slen, out);
  return nerr_pass (err);
}
/* Map the name of an HTML character reference to its ISO-8859-1 byte.
 *
 * s - the text between '&' and ';', e.g. "amp", "eacute", "#233" or
 *     "#xe9".  Named references are matched case-sensitively against
 *     lower-case names.
 *
 * Returns the byte value, or 0 when the reference is empty, unknown, or
 * numeric but outside 0..255 (i.e. not representable in ISO-8859-1).
 * "nbsp" deliberately maps to a plain space.
 */
static unsigned char _expand_amp_8859_1_char (const char *s)
{
  static const struct {
    const char *name;
    unsigned char code;
  } entities[] = {
    { "agrave", 0xe0 }, { "aacute", 0xe1 }, { "acirc",  0xe2 },
    { "atilde", 0xe3 }, { "auml",   0xe4 }, { "aring",  0xe5 },
    { "aelig",  0xe6 }, { "amp",    '&'  },
    { "ccedil", 0xe7 },
    { "egrave", 0xe8 }, { "eacute", 0xe9 }, { "ecirc",  0xea },
    { "euml",   0xeb }, { "eth",    0xf0 },
    { "igrave", 0xec }, { "iacute", 0xed }, { "icirc",  0xee },
    { "iuml",   0xef },
    { "gt",     '>'  },
    { "lt",     '<'  },
    { "ntilde", 0xf1 }, { "nbsp",   ' '  },
    { "ograve", 0xf2 }, { "oacute", 0xf3 }, { "ocirc",  0xf4 },
    { "otilde", 0xf5 }, { "ouml",   0xf6 }, { "oslash", 0xf8 },
    { "quot",   '"'  },
    { "szlig",  0xdf },
    { "thorn",  0xfe },
    { "ugrave", 0xf9 }, { "uacute", 0xfa }, { "ucirc",  0xfb },
    { "uuml",   0xfc },
    { "yacute", 0xfd }
  };
  size_t idx;

  if (s[0] == '\0')
    return 0;

  if (s[0] == '#')
  {
    /* Numeric reference: decimal, or hexadecimal after 'x'/'X'. */
    long value;

    if (s[1] == 'x' || s[1] == 'X')
      value = strtol (s + 2, NULL, 16);
    else
      value = strtol (s + 1, NULL, 10);

    /* Anything that does not fit one byte has no ISO-8859-1 equivalent;
     * truncating it would yield an unrelated character. */
    if (value < 0 || value > 0xff)
      return 0;
    return (unsigned char) value;
  }

  for (idx = 0; idx < sizeof(entities) / sizeof(entities[0]); idx++)
  {
    if (!strcmp (s, entities[idx].name))
      return entities[idx].code;
  }
  return 0;
}
/* Expand the character reference named by amp into displayable text.
 *
 * amp - the text between '&' and ';'
 * buf - caller-provided scratch space; at least two bytes are written
 *       when the reference maps to a single character
 *
 * Returns buf holding the one-character string when the reference is
 * known, the constant "(C)" for "copy", or the constant "" otherwise.
 * The constant results are string literals and must not be modified.
 */
char *html_expand_amp_8859_1(const char *amp,
                             char *buf)
{
  unsigned char code = _expand_amp_8859_1_char(amp);

  if (code != '\0')
  {
    buf[0] = (char)code;
    buf[1] = '\0';
    return buf;
  }

  /* No single-byte equivalent: fall back to a textual stand-in. */
  return strcmp(amp, "copy") ? "" : "(C)";
}
/* Strip HTML markup from src and expand character references.
 *
 * src  - HTML text; the first slen bytes are read
 * slen - number of bytes of src to process
 * out  - receives the resulting STRING buffer on success
 *
 * Everything from '<' up to the next '>' is dropped.  "&name;" is replaced
 * by the result of html_expand_amp_8859_1().  An '&' that is not followed
 * by a well-formed reference -- a name of at most 9 letters, digits or
 * '#' terminated by ';' -- is kept as a literal '&' and the text after it
 * is processed normally; this includes an '&' whose reference is cut off
 * by the end of the input.
 *
 * Returns STATUS_OK on success; on failure the partial output is released
 * and the error is passed up.
 */
NEOERR *html_strip_alloc(const char *src, int slen,
                         char **out)
{
  enum { ST_TEXT, ST_TAG, ST_AMP };
  NEOERR *err = STATUS_OK;
  STRING out_s;
  int x = 0;
  int state = ST_TEXT;
  char amp[10];
  int amp_start = 0;
  char buf[10];
  int ampl = 0;

  string_init(&out_s);
  /* Appending "" up front means a successful call always hands back a
   * buffer, even when every byte of the input is stripped. */
  err = string_append (&out_s, "");
  if (err) return nerr_pass (err);

  /* The loop keeps running at end of input while a reference is still
   * open, so that the pending '&' can be emitted instead of being lost. */
  while (!err && (x < slen || state == ST_AMP))
  {
    switch (state) {
      case ST_TEXT:
        if (src[x] == '&')
        {
          state = ST_AMP;
          ampl = 0;
          amp_start = x;
        }
        else if (src[x] == '<')
        {
          state = ST_TAG;
        }
        else
        {
          err = string_append_char(&out_s, src[x]);
        }
        x++;
        break;

      case ST_TAG:
        if (src[x] == '>')
          state = ST_TEXT;
        x++;
        break;

      case ST_AMP:
        if (x < slen && src[x] == ';')
        {
          amp[ampl] = '\0';
          state = ST_TEXT;
          err = string_append(&out_s, html_expand_amp_8859_1(amp, buf));
          x++;
        }
        else if (x < slen && ampl < (int) sizeof(amp) - 1 &&
                 (isalnum((unsigned char) src[x]) || src[x] == '#'))
        {
          amp[ampl++] = (char) tolower((unsigned char) src[x]);
          x++;
        }
        else
        {
          /* broken html... emit the '&' itself and rescan what follows */
          err = string_append_char(&out_s, src[amp_start]);
          x = amp_start + 1;
          state = ST_TEXT;
        }
        break;
    }
  }

  if (err)
  {
    string_clear (&out_s);
    return nerr_pass (err);
  }
  *out = out_s.buf;
  return STATUS_OK;
}