C++程序  |  589行  |  14.9 KB

/* `L' command implementation for GNU sed, based on GNU fmt 1.22.
   Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

/* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>.  */

#include "sed.h"

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>

#if HAVE_LIMITS_H
# include <limits.h>
#endif

#ifndef UINT_MAX
# define UINT_MAX ((unsigned int) ~(unsigned int) 0)
#endif

#ifndef INT_MAX
# define INT_MAX ((int) (UINT_MAX >> 1))
#endif

/* The following parameters represent the program's idea of what is
   "best".  Adjust to taste, subject to the caveats given.  */

/* Prefer lines to be LEEWAY % shorter than the maximum width, giving
   room for optimization.  */
#define	LEEWAY	7

/* Costs and bonuses are expressed as the equivalent departure from the
   optimal line length, multiplied by 10.  e.g. assigning something a
   cost of 50 means that it is as bad as a line 5 characters too short
   or too long.  The definition of SHORT_COST(n) should not be changed.
   However, EQUIV(n) may need tuning.  */

typedef long COST;

#define	MAXCOST	(~(((unsigned long) 1) << (8 * sizeof (COST) -1)))

#define	SQR(n)		((n) * (n))
#define	EQUIV(n)	SQR ((COST) (n))

/* Cost of a filled line n chars longer or shorter than best_width.  */
#define	SHORT_COST(n)	EQUIV ((n) * 10)

/* Cost of the difference between adjacent filled lines.  */
#define	RAGGED_COST(n)	(SHORT_COST (n) / 2)

/* Basic cost per line.  */
#define	LINE_COST	EQUIV (70)

/* Cost of breaking a line after the first word of a sentence, where
   the length of the word is N.  */
#define	WIDOW_COST(n)	(EQUIV (200) / ((n) + 2))

/* Cost of breaking a line before the last word of a sentence, where
   the length of the word is N.  */
#define	ORPHAN_COST(n)	(EQUIV (150) / ((n) + 2))

/* Bonus for breaking a line at the end of a sentence.  */
#define	SENTENCE_BONUS	EQUIV (50)

/* Cost of breaking a line after a period not marking end of a sentence.
   With the definition of sentence we are using (borrowed from emacs, see
   get_line()) such a break would then look like a sentence break.  Hence
   we assign a very high cost -- it should be avoided unless things are
   really bad.  */
#define	NOBREAK_COST	EQUIV (600)

/* Bonus for breaking a line before open parenthesis.  */
#define	PAREN_BONUS	EQUIV (40)

/* Bonus for breaking a line after other punctuation.  */
#define	PUNCT_BONUS	EQUIV(40)

/* Credit for breaking a long paragraph one line later.  */
#define	LINE_CREDIT	EQUIV(3)

/* Size of paragraph buffer in words.  Longer paragraphs are handled
   neatly (cf. flush_paragraph()), so there's little to gain by making
   these larger.  */
#define	MAXWORDS	1000

#define GETC()          (parabuf == end_of_parabuf ? EOF : *parabuf++)

/* Extra ctype(3)-style macros.  */

#define	isopen(c)	(strchr ("([`'\"", (c)) != NULL)
#define	isclose(c)	(strchr (")]'\"", (c)) != NULL)
#define	isperiod(c)	(strchr (".?!", (c)) != NULL)

/* Size of a tab stop, for expansion on input and re-introduction on
   output.  */
#define	TABWIDTH	8

/* Word descriptor structure.  */

typedef struct Word WORD;

struct Word
  {

    /* Static attributes determined during input.  */

    const char *text;		/* the text of the word */
    short length;		/* length of this word */
    short space;		/* the size of the following space */
    unsigned paren:1;		/* starts with open paren */
    unsigned period:1;		/* ends in [.?!])* */
    unsigned punct:1;		/* ends in punctuation */
    unsigned final:1;		/* end of sentence */

    /* The remaining fields are computed during the optimization.  */

    short line_length;		/* length of the best line starting here */
    COST best_cost;		/* cost of best paragraph starting here */
    WORD *next_break;		/* break which achieves best_cost */
  };

/* Forward declarations.  */

static bool get_paragraph P_ ((void));
static int get_line P_ ((int c));
static int get_space P_ ((int c));
static int copy_rest P_ ((int c));
static bool same_para P_ ((int c));
static void flush_paragraph P_ ((void));
static void fmt_paragraph P_ ((void));
static void check_punctuation P_ ((WORD *w));
static COST base_cost P_ ((WORD *this));
static COST line_cost P_ ((WORD *next, int len));
static void put_paragraph P_ ((WORD *finish));
static void put_line P_ ((WORD *w, int indent));
static void put_word P_ ((WORD *w));
static void put_space P_ ((int space));

/* Option values.  */

/* User-supplied maximum line width (default WIDTH).  The only output
   lines
   longer than this will each comprise a single word.  */
static int max_width;

/* Space for the paragraph text.  */
static const char *parabuf;

/* End of space for the paragraph text.  */
static const char *end_of_parabuf;

/* The file on which we output */
static FILE *outfile;

/* Values derived from the option values.  */

/* The preferred width of text lines, set to LEEWAY % less than max_width.  */
static int best_width;

/* Dynamic variables.  */

/* Start column of the character most recently read from the input file.  */
static int in_column;

/* Start column of the next character to be written to stdout.  */
static int out_column;

/* The words of a paragraph -- longer paragraphs are handled neatly
   (cf. flush_paragraph()).  */
static WORD words[MAXWORDS];

/* A pointer into the above word array, indicating the first position
   after the last complete word.  Sometimes it will point at an incomplete
   word.  */
static WORD *word_limit;

/* Indentation of the first line of the current paragraph.  */
static int first_indent;

/* Indentation of other lines of the current paragraph */
static int other_indent;

/* The last character read from the input file.  */
static int next_char;

/* If nonzero, the length of the last line output in the current
   paragraph, used to charge for raggedness at the split point for long
   paragraphs chosen by fmt_paragraph().  */
static int last_line_length;

/* read file F and send formatted output to stdout.  */

void
fmt (const char *line, const char *line_end, int max_length, FILE *output_file)
{
  parabuf = line;
  end_of_parabuf = line_end;
  outfile = output_file;

  max_width = max_length;
  best_width = max_width * (201 - 2 * LEEWAY) / 200;

  in_column = 0;
  other_indent = 0;
  next_char = GETC();
  while (get_paragraph ())
    {
      fmt_paragraph ();
      put_paragraph (word_limit);
    }
}

/* Read a paragraph from input file F.  A paragraph consists of a
   maximal number of non-blank (excluding any prefix) lines
   with the same indent.

   Return false if end-of-file was encountered before the start of a
   paragraph, else true.  */

static bool
get_paragraph ()
{
  register int c;

  last_line_length = 0;
  c = next_char;

  /* Scan (and copy) blank lines, and lines not introduced by the prefix.  */

  while (c == '\n' || c == EOF)
    {
      c = copy_rest (c);
      if (c == EOF)
	{
	  next_char = EOF;
	  return false;
	}
      putc ('\n', outfile);
      c = GETC();
    }

  /* Got a suitable first line for a paragraph.  */

  first_indent = in_column;
  word_limit = words;
  c = get_line (c);

  /* Read rest of paragraph.  */

  other_indent = in_column;
  while (same_para (c) && in_column == other_indent)
    c = get_line (c);

  (word_limit - 1)->period = (word_limit - 1)->final = true;
  next_char = c;
  return true;
}

/* Copy to the output a blank line.  In the latter, C is \n or EOF.
   Return the character (\n or EOF) ending the line.  */

static int
copy_rest (register int c)
{
  out_column = 0;
  while (c != '\n' && c != EOF)
    {
      putc (c, outfile);
      c = GETC();
    }
  return c;
}

/* Return true if a line whose first non-blank character after the
   prefix (if any) is C could belong to the current paragraph,
   otherwise false.  */

static bool
same_para (register int c)
{
  return (c != '\n' && c != EOF);
}

/* Read a line from the input data given first non-blank character C
   after the prefix, and the following indent, and break it into words.
   A word is a maximal non-empty string of non-white characters.  A word
   ending in [.?!]["')\]]* and followed by end-of-line or at least two
   spaces ends a sentence, as in emacs.

   Return the first non-blank character of the next line.  */

static int
get_line (register int c)
{
  int start;
  register WORD *end_of_word;

  end_of_word = &words[MAXWORDS - 2];

  do
    {				/* for each word in a line */

      /* Scan word.  */

      word_limit->text = parabuf - 1;
      do
	c = GETC();
      while (c != EOF && !ISSPACE (c));
      word_limit->length = parabuf - word_limit->text - (c != EOF);
      in_column += word_limit->length;

      check_punctuation (word_limit);

      /* Scan inter-word space.  */

      start = in_column;
      c = get_space (c);
      word_limit->space = in_column - start;
      word_limit->final = (c == EOF
			   || (word_limit->period
			       && (c == '\n' || word_limit->space > 1)));
      if (c == '\n' || c == EOF)
	word_limit->space = word_limit->final ? 2 : 1;
      if (word_limit == end_of_word)
	flush_paragraph ();
      word_limit++;
      if (c == EOF)
	{
	  in_column = first_indent;
	  return EOF;
	}
    }
  while (c != '\n');

  in_column = 0;
  c = GETC();
  return get_space (c);
}

/* Read blank characters from the input data, starting with C, and keeping
   in_column up-to-date.  Return first non-blank character.  */

static int
get_space (register int c)
{
  for (;;)
    {
      if (c == ' ')
	in_column++;
      else if (c == '\t')
	in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
      else
	return c;
      c = GETC();
    }
}

/* Set extra fields in word W describing any attached punctuation.  */

static void
check_punctuation (register WORD *w)
{
  register const char *start, *finish;

  start = w->text;
  finish = start + (w->length - 1);
  w->paren = isopen (*start);
  w->punct = ISPUNCT (*finish);
  while (isclose (*finish) && finish > start)
    finish--;
  w->period = isperiod (*finish);
}

/* Flush part of the paragraph to make room.  This function is called on
   hitting the limit on the number of words or characters.  */

static void
flush_paragraph (void)
{
  WORD *split_point;
  register WORD *w;
  COST best_break;

  /* - format what you have so far as a paragraph,
     - find a low-cost line break near the end,
     - output to there,
     - make that the start of the paragraph.  */

  fmt_paragraph ();

  /* Choose a good split point.  */

  split_point = word_limit;
  best_break = MAXCOST;
  for (w = words->next_break; w != word_limit; w = w->next_break)
    {
      if (w->best_cost - w->next_break->best_cost < best_break)
	{
	  split_point = w;
	  best_break = w->best_cost - w->next_break->best_cost;
	}
      if (best_break <= MAXCOST - LINE_CREDIT)
	best_break += LINE_CREDIT;
    }
  put_paragraph (split_point);

  /* Copy words from split_point down to word -- we use memmove because
     the source and target may overlap.  */

  memmove ((char *) words, (char *) split_point,
	 (word_limit - split_point + 1) * sizeof (WORD));
  word_limit -= split_point - words;
}

/* Compute the optimal formatting for the whole paragraph by computing
   and remembering the optimal formatting for each suffix from the empty
   one to the whole paragraph.  */

static void
fmt_paragraph (void)
{
  register WORD *start, *w;
  register int len;
  register COST wcost, best;
  int saved_length;

  word_limit->best_cost = 0;
  saved_length = word_limit->length;
  word_limit->length = max_width;	/* sentinel */

  for (start = word_limit - 1; start >= words; start--)
    {
      best = MAXCOST;
      len = start == words ? first_indent : other_indent;

      /* At least one word, however long, in the line.  */

      w = start;
      len += w->length;
      do
	{
	  w++;

	  /* Consider breaking before w.  */

	  wcost = line_cost (w, len) + w->best_cost;
	  if (start == words && last_line_length > 0)
	    wcost += RAGGED_COST (len - last_line_length);
	  if (wcost < best)
	    {
	      best = wcost;
	      start->next_break = w;
	      start->line_length = len;
	    }
	  len += (w - 1)->space + w->length;	/* w > start >= words */
	}
      while (len < max_width);
      start->best_cost = best + base_cost (start);
    }

  word_limit->length = saved_length;
}

/* Return the constant component of the cost of breaking before the
   word THIS.  */

static COST
base_cost (register WORD *this)
{
  register COST cost;

  cost = LINE_COST;

  if (this > words)
    {
      if ((this - 1)->period)
	{
	  if ((this - 1)->final)
	    cost -= SENTENCE_BONUS;
	  else
	    cost += NOBREAK_COST;
	}
      else if ((this - 1)->punct)
	cost -= PUNCT_BONUS;
      else if (this > words + 1 && (this - 2)->final)
	cost += WIDOW_COST ((this - 1)->length);
    }

  if (this->paren)
    cost -= PAREN_BONUS;
  else if (this->final)
    cost += ORPHAN_COST (this->length);

  return cost;
}

/* Return the component of the cost of breaking before word NEXT that
   depends on LEN, the length of the line beginning there.  */

static COST
line_cost (register WORD *next, register int len)
{
  register int n;
  register COST cost;

  if (next == word_limit)
    return 0;
  n = best_width - len;
  cost = SHORT_COST (n);
  if (next->next_break != word_limit)
    {
      n = len - next->line_length;
      cost += RAGGED_COST (n);
    }
  return cost;
}

/* Output to stdout a paragraph from word up to (but not including)
   FINISH, which must be in the next_break chain from word.  */

static void
put_paragraph (register WORD *finish)
{
  register WORD *w;

  put_line (words, first_indent);
  for (w = words->next_break; w != finish; w = w->next_break)
    put_line (w, other_indent);
}

/* Output to stdout the line beginning with word W, beginning in column
   INDENT, including the prefix (if any).  */

static void
put_line (register WORD *w, int indent)
{
  register WORD *endline;
  out_column = 0;
  put_space (indent);

  endline = w->next_break - 1;
  for (; w != endline; w++)
    {
      put_word (w);
      put_space (w->space);
    }
  put_word (w);
  last_line_length = out_column;
  putc ('\n', outfile);
}

/* Output to stdout the word W.  */

static void
put_word (register WORD *w)
{
  register const char *s;
  register int n;

  s = w->text;
  for (n = w->length; n != 0; n--)
    putc (*s++, outfile);
  out_column += w->length;
}

/* Output to stdout SPACE spaces, or equivalent tabs.  */

static void
put_space (int space)
{
  out_column += space;
  while (space--)
    putc (' ', outfile);
}