/*
 * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA  94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/NoticeExplan/
 */
/*
 * doio -	a general purpose io initiator with system call and
 *		write logging.  See doio.h for the structure which defines
 *		what doio requests should look like.
 *
 *		Currently doio can handle read,write,reada,writea,ssread,
 *		sswrite, and many varieties of listio requests.
 *		For disk io, if the O_SSD flag is set doio will allocate
 *		the appropriate amount of ssd and do the transfer - thus, doio
 *		can handle all of the primitive types of file io.
 *
 * programming
 * notes:
 * -----------
 *	messages should generally be printed using doio_fprintf().
 *
 */

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <time.h>
#include <stdarg.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#ifdef CRAY
#include <sys/iosw.h>
#endif
#ifdef sgi
#include <aio.h>		/* for aio_read,write */
#include <inttypes.h>		/* for uint64_t type */
#include <siginfo.h>		/* signal handlers & SA_SIGINFO */
#endif
#ifndef CRAY
#include <sys/uio.h>		/* for struct iovec (readv) */
#include <sys/mman.h>		/* for mmap(2) */
#include <sys/ipc.h>		/* for i/o buffer in shared memory */
#include <sys/shm.h>		/* for i/o buffer in shared memory */
#endif
#include <sys/wait.h>
#ifdef CRAY
#include <sys/listio.h>
#include <sys/panic.h>
#endif
#include <sys/time.h>		/* for delays */

#include "doio.h"
#include "write_log.h"
#include "random_range.h"
#include "string_to_tokens.h"
#include "pattern.h"

/*
 * Memory-allocation strategy table.
 *
 * NMEMALLOC is the capacity of Memalloc[].  The MEM_* macros are small
 * integer type codes and the MEMF_* macros are octal bit flags that can
 * be OR'ed together.
 *
 * NOTE(review): the code that fills and consumes Memalloc[] is not in
 * this part of the file (presumably parse_memalloc()/alloc_mem()), so the
 * per-field notes below are limited to what the declarations show.
 */
#define	NMEMALLOC	32
#define	MEM_DATA	1	/* data space                           */
#define	MEM_SHMEM	2	/* System V shared memory               */
#define	MEM_T3ESHMEM	3	/* T3E Shared Memory                    */
#define	MEM_MMAP	4	/* mmap(2)                              */

#define	MEMF_PRIVATE	0001
#define	MEMF_AUTORESRV	0002
#define	MEMF_LOCAL	0004
#define	MEMF_SHARED	0010

#define	MEMF_FIXADDR	0100
#define	MEMF_ADDR	0200
#define	MEMF_AUTOGROW	0400
#define	MEMF_FILE	01000	/* regular file -- unlink on close      */
#define	MEMF_MPIN	010000	/* use mpin(2) to lock pages in memory */

struct memalloc {
	int memtype;		/* presumably one of MEM_* -- TODO confirm */
	int flags;		/* presumably an OR of MEMF_* -- TODO confirm */
	int nblks;
	char *name;
	void *space;		/* memory address of allocated space */
	int fd;			/* FD open for mmaping */
	int size;
} Memalloc[NMEMALLOC];

/*
 * Structure for maintaining a cache of open test-file descriptors.  Used
 * by alloc_fd().
 *
 * The direct-I/O fields exist only in sgi builds and the mmap fields only
 * in non-CRAY builds.
 */

struct fd_cache {
	char c_file[MAX_FNAME_LENGTH + 1];	/* file name, NUL-terminated */
	int c_oflags;		/* open(2) flags -- presumably those the fd was opened with */
	int c_fd;		/* the cached descriptor */
	long c_rtc;		/* NOTE(review): meaning not visible here -- confirm in alloc_fdcache() */
#ifdef sgi
	int c_memalign;		/* from F_DIOINFO */
	int c_miniosz;
	int c_maxiosz;
#endif
#ifndef CRAY
	void *c_memaddr;	/* mmapped address */
	int c_memlen;		/* length of above region */
#endif
};

/*
 * Name-To-Value map
 * Used to map cmdline arguments to values.
 *
 * Tables of struct smap in this file are terminated by a sentinel entry:
 * either {NULL, 0} (Upanic_Args, checkmap, delaymap) or {"unknown", -1}
 * (sysnames, aionames).
 */
struct smap {
	char *string;		/* symbolic name */
	int value;		/* value associated with that name */
};

/*
 * Bookkeeping for one asynchronous I/O request.  The slots live in the
 * Aio_Info[MAX_AIO] array; see aio_slot(), aio_register() and
 * aio_unregister() for the accessors declared later in this file.
 *
 * The iosw member exists only in CRAY builds; the aiocb and aio_* result
 * members exist only in sgi builds.
 */
struct aio_info {
	int busy;
	int id;
	int fd;
	int strategy;		/* presumably one of the A_* completion strategies -- TODO confirm */
	volatile int done;	/* volatile: presumably updated from a signal handler -- TODO confirm */
#ifdef CRAY
	struct iosw iosw;
#endif
#ifdef sgi
	aiocb_t aiocb;
	int aio_ret;		/* from aio_return */
	int aio_errno;		/* from aio_error */
#endif
	int sig;
	int signalled;
	struct sigaction osa;	/* NOTE(review): looks like the saved previous signal action -- confirm */
};

/* ---------------------------------------------------------------------------
 *
 * A new paradigm of doing the r/w system call where there is a "stub"
 * function that builds the info for the system call, then does the system
 * call; this is called by code that is common to all system calls and does
 * the syscall return checking, async I/O wait, iosw check, etc.
 *
 * Flags:
 *	WRITE, ASYNC, SSD/SDS,
 *	FILE_LOCK, WRITE_LOG, VERIFY_DATA,
 */

/*
 * Result of one stub call.  The sy_* stub functions declared below all
 * return a pointer to one of these.
 */
struct status {
	int rval;		/* syscall return */
	int err;		/* errno */
	int *aioid;		/* list of async I/O structures */
};

/*
 * Description of one supported system call: its name and request type,
 * plus the three callbacks used to drive it.  The callbacks are declared
 * with empty (unprototyped) parameter lists, so the compiler does not
 * check the arguments passed through them.
 */
struct syscall_info {
	char *sy_name;		/* printable name */
	int sy_type;		/* request type -- presumably an io_req r_type value; TODO confirm */
	struct status *(*sy_syscall) ();	/* stub that performs the call */
	int (*sy_buffer) ();	/* NOTE(review): buffer-size helper? confirm at the table definition */
	char *(*sy_format) ();	/* formats the request for messages */
	int sy_flags;		/* presumably an OR of the SY_* bits -- TODO confirm */
	int sy_bits;
};

/* Octal bit flags describing a system call (see struct syscall_info). */
#define	SY_WRITE		00001
#define	SY_ASYNC		00010
#define	SY_IOSW			00020
#define	SY_SDS			00100

#ifndef O_SSD
#define O_SSD 0			/* so code compiles on a CRAY2 */
#endif

/* 64-bit word type: uint64_t on sgi, unsigned long everywhere else. */
#ifdef sgi
#define UINT64_T uint64_t
#else
#define UINT64_T unsigned long
#endif

#ifndef O_PARALLEL
#define O_PARALLEL 0		/* so O_PARALLEL may be used in expressions */
#endif

#define PPID_CHECK_INTERVAL 5	/* check ppid every <-- iterations */
#define	MAX_AIO		256	/* maximum number of async I/O ops */
#ifdef _CRAYMPP
#define	MPP_BUMP	16	/* page un-alignment for MPP */
#else
#define	MPP_BUMP	0
#endif

/* Text for the current errno; evaluate it before anything can reset errno. */
#define	SYSERR strerror(errno)

/*
 * getopt() string of supported cmdline arguments.
 */

#define OPTS	"aC:d:ehm:n:kr:w:vU:V:M:N:"

#define DEF_RELEASE_INTERVAL	0

/*
 * Flags set in parse_cmdline() to indicate which options were selected
 * on the cmdline.  All start at 0 (option not given).
 */

int a_opt = 0;			/* abort on data compare errors     */
int e_opt = 0;			/* exec() after fork()'ing          */
int C_opt = 0;			/* Data Check Type                  */
int d_opt = 0;			/* delay between operations         */
int k_opt = 0;			/* lock file regions during writes  */
int m_opt = 0;			/* generate periodic messages       */
int n_opt = 0;			/* nprocs                           */
int r_opt = 0;			/* resource release interval        */
int w_opt = 0;			/* file write log file              */
int v_opt = 0;			/* verify writes if set             */
int U_opt = 0;			/* upanic() on various conditions   */
int V_opt = 0;			/* over-ride default validation fd type */
int M_opt = 0;			/* data buffer allocation types     */
char TagName[40];		/* name of this doio (see Monster)  */

/*
 * Misc globals initialized in parse_cmdline()
 */

char *Prog = NULL;		/* set up in parse_cmdline()                */
int Upanic_Conditions;		/* set by args to -U                        */
int Release_Interval;		/* arg to -r                                */
int Nprocs;			/* arg to -n                                */
char *Write_Log;		/* arg to -w                                */
char *Infile;			/* input file (defaults to stdin)           */
int *Children;			/* pids of child procs                      */
int Nchildren = 0;		/* number of entries used in Children[]     */
int Nsiblings = 0;		/* tfork'ed siblings                        */
int Execd = 0;			/* non-zero: main() calls doio() directly   */
int Message_Interval = 0;	/* print an Info message every N requests   */
int Npes = 0;			/* non-zero if built as an mpp multi-pe app */
int Vpe = -1;			/* Virtual pe number if Npes >= 0           */
int Reqno = 1;			/* request # - used in some error messages  */
int Reqskipcnt = 0;		/* count of I/O requests that are skipped   */
int Validation_Flags;
char *(*Data_Check) ();		/* function to call for data checking       */
int (*Data_Fill) ();		/* function to call for data filling        */
int Nmemalloc = 0;		/* number of memory allocation strategies   */
int delayop = 0;		/* delay between operations - type of delay */
int delaytime = 0;		/* delay between operations - how long      */

struct wlog_file Wlog;		/* write-log handle used with wlog_open()   */

int active_mmap_rw = 0;		/* Indicates that mmapped I/O is occurring. */
				/* Used by sigbus_action() in the child doio. */
int havesigint = 0;

#define SKIP_REQ	-2	/* skip I/O request */

/*
 * Global file descriptors
 */

int Wfd_Append;			/* for appending to the write-log       */
int Wfd_Random;			/* for overlaying write-log entries     */

#define FD_ALLOC_INCR	32	/* allocate this many fd_map structs    */
				/* at a time */

/*
 * Globals for tracking Sds and Core usage
 */

char *Memptr;			/* ptr to core buffer space             */
int Memsize;			/* # bytes pointed to by Memptr         */
				/* maintained by alloc_mem()            */

int Sdsptr;			/* sds offset (always 0)                */
int Sdssize;			/* # bytes of allocated sds space       */
				/* Maintained by alloc_sds()            */
char Host[16];			/* host name, cut at the first '.' by doio() */
char Pattern[128];		/* data pattern built in doio()         */
int Pattern_Length;		/* length of the string in Pattern[]    */

/*
 * Forward declarations for the functions defined in this file.
 *
 * NOTE(review): the only definition of syserrno() visible in this part of
 * the file is commented out; the prototype is kept so existing references
 * keep compiling.
 */

char *syserrno(int err);
void doio(void);
void doio_delay(void);
char *format_oflags(int oflags);
char *format_strat(int strategy);
char *format_rw(struct io_req *ioreq, int fd, void *buffer,
		int signo, char *pattern, void *iosw);
#ifdef CRAY
/* The comma between "int sds" and "char *pattern" was missing. */
char *format_sds(struct io_req *ioreq, void *buffer, int sds, char *pattern);
#endif /* CRAY */

/* Request handlers for the classic read/write paths. */
int do_read(struct io_req *req);
int do_write(struct io_req *req);
int lock_file_region(char *fname, int fd, int type, int start, int nbytes);

#ifdef CRAY
char *format_listio(struct io_req *ioreq, int lcmd,
		    struct listreq *list, int nent, int fd, char *pattern);
#endif /* CRAY */

int do_listio(struct io_req *req);

#if defined(_CRAY1) || defined(CRAY)
int do_ssdio(struct io_req *req);
#endif /* defined(_CRAY1) || defined(CRAY) */

char *fmt_ioreq(struct io_req *ioreq, struct syscall_info *sy, int fd);

/*
 * System-call stubs (sy_*) and their formatters (fmt_*), grouped by the
 * platform that provides the underlying call.
 */
#ifdef CRAY
struct status *sy_listio(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
int listio_mem(struct io_req *req, int offset, int fmstride,
	       int *min, int *max);
char *fmt_listio(struct io_req *req, struct syscall_info *sy,
		 int fd, char *addr);
#endif /* CRAY */

#ifdef sgi
struct status *sy_pread(struct io_req *req, struct syscall_info *sysc,
			int fd, char *addr);
struct status *sy_pwrite(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
char *fmt_pread(struct io_req *req, struct syscall_info *sy,
		int fd, char *addr);
#endif /* sgi */

#ifndef CRAY
struct status *sy_readv(struct io_req *req, struct syscall_info *sysc,
			int fd, char *addr);
struct status *sy_writev(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
struct status *sy_rwv(struct io_req *req, struct syscall_info *sysc,
		      int fd, char *addr, int rw);
char *fmt_readv(struct io_req *req, struct syscall_info *sy,
		int fd, char *addr);
#endif /* !CRAY */

/* Asynchronous read/write stubs and formatter (sgi builds only). */
#ifdef sgi
struct status *sy_aread(struct io_req *req, struct syscall_info *sysc,
			int fd, char *addr);
/* The terminating ';' on this prototype was missing. */
struct status *sy_awrite(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
struct status *sy_arw(struct io_req *req, struct syscall_info *sysc,
		      int fd, char *addr, int rw);
char *fmt_aread(struct io_req *req, struct syscall_info *sy,
		int fd, char *addr);
#endif /* sgi */

/* mmap-based read/write stubs and formatter (non-CRAY builds only). */
#ifndef CRAY
struct status *sy_mmread(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
struct status *sy_mmwrite(struct io_req *req, struct syscall_info *sysc,
			  int fd, char *addr);
struct status *sy_mmrw(struct io_req *req, struct syscall_info *sysc,
		       int fd, char *addr, int rw);
char *fmt_mmrw(struct io_req *req, struct syscall_info *sy, int fd, char *addr);
#endif /* !CRAY */

/* Request handlers called from the main loop in doio(). */
int do_rw(struct io_req *req);

#ifdef sgi
int do_fcntl(struct io_req *req);
#endif /* sgi */

#ifndef CRAY
int do_sync(struct io_req *req);
#endif /* !CRAY */

/* Data pattern fill/check helpers, message output and resource allocation. */
int doio_pat_fill(char *addr, int mem_needed, char *Pattern,
		  int Pattern_Length, int shift);
char *doio_pat_check(char *buf, int offset, int length,
		     char *pattern, int pattern_length, int patshift);
char *check_file(char *file, int offset, int length, char *pattern,
		 int pattern_length, int patshift, int fsa);
int doio_fprintf(FILE * stream, char *format, ...);
int alloc_mem(int nbytes);

#if defined(_CRAY1) || defined(CRAY)
int alloc_sds(int nbytes);
#endif /* defined(_CRAY1) || defined(CRAY) */

int alloc_fd(char *file, int oflags);
struct fd_cache *alloc_fdcache(char *file, int oflags);

/*
 * Signal handlers.  sgi builds use the three-argument SA_SIGINFO form;
 * all other builds use the classic one-argument form.
 */
#ifdef sgi
void signal_info(int sig, siginfo_t * info, void *v);
void cleanup_handler(int sig, siginfo_t * info, void *v);
void die_handler(int sig, siginfo_t * info, void *v);
void sigbus_handler(int sig, siginfo_t * info, void *v);
#else /* !sgi */
void cleanup_handler(int sig);
void die_handler(int sig);

#ifndef CRAY
void sigbus_handler(int sig);
#endif /* !CRAY */
#endif /* sgi */

void noop_handler(int sig);
void sigint_handler(int sig);
void aio_handler(int sig);
void dump_aio(void);

#ifdef sgi
void cb_handler(sigval_t val);
#endif /* sgi */

/* Async I/O slot management (see struct aio_info / Aio_Info[]). */
struct aio_info *aio_slot(int aio_id);
int aio_register(int fd, int strategy, int sig);
int aio_unregister(int aio_id);

#ifndef __linux__
int aio_wait(int aio_id);
#endif /* !__linux__ */

char *hms(time_t t);
int aio_done(struct aio_info *ainfo);
void doio_upanic(int mask);
int parse_cmdline(int argc, char **argv, char *opts);

#ifndef CRAY
void parse_memalloc(char *arg);
void dump_memalloc(void);
#endif /* !CRAY */

void parse_delay(char *arg);
int usage(FILE * stream);
void help(FILE * stream);

/*
 * Upanic conditions, and a map from symbolics to values.
 *
 * The U_* values are single-bit octal flags so they can be OR'ed together;
 * U_ALL is the union of all three.  Upanic_Args[] ends with a {NULL, 0}
 * sentinel.
 */

#define U_CORRUPTION	0001	/* upanic on data corruption    */
#define U_IOSW	    	0002	/* upanic on bad iosw           */
#define U_RVAL	    	0004	/* upanic on bad rval           */

#define U_ALL	    	(U_CORRUPTION | U_IOSW | U_RVAL)

struct smap Upanic_Args[] = {
	{"corruption", U_CORRUPTION},
	{"iosw", U_IOSW},
	{"rval", U_RVAL},
	{"all", U_ALL},
	{NULL, 0}
};

/* Fixed pool of MAX_AIO async I/O bookkeeping slots. */
struct aio_info Aio_Info[MAX_AIO];

/*
 * -C data-fill/check type.
 * Only "default" is defined; the table ends with a {NULL, 0} sentinel.
 */
#define	C_DEFAULT	1
struct smap checkmap[] = {
	{"default", C_DEFAULT},
	{NULL, 0},
};

/*
 * -d option delay types.
 *
 * delaymap[] maps the delay-type names to the DELAY_* codes and ends with
 * a {NULL, 0} sentinel.  "sginap" is only offered in sgi builds, and
 * DELAY_ITIMER is defined but has no entry in the table.
 */
#define	DELAY_SELECT	1
#define	DELAY_SLEEP	2
#define	DELAY_SGINAP	3
#define	DELAY_ALARM	4
#define	DELAY_ITIMER	5	/* POSIX timer                          */

struct smap delaymap[] = {
	{"select", DELAY_SELECT},
	{"sleep", DELAY_SLEEP},
#ifdef sgi
	{"sginap", DELAY_SGINAP},
#endif
	{"alarm", DELAY_ALARM},
	{NULL, 0},
};

/******
*
* strerror() does similar actions.

char *
syserrno(int err)
{
    static char sys_errno[10];
    sprintf(sys_errno, "%d", errno);
    return(sys_errno);
}

******/

int main(int argc, char **argv)
{
	int i, pid, stat, ex_stat;
#ifdef CRAY
	sigset_t omask;
#elif defined(linux)
	sigset_t omask, block_mask;
#else
	int omask;
#endif
	struct sigaction sa;

	umask(0);		/* force new file modes to known values */
#if _CRAYMPP
	Npes = sysconf(_SC_CRAY_NPES);	/* must do this before parse_cmdline */
	Vpe = sysconf(_SC_CRAY_VPE);
#endif

	TagName[0] = '\0';
	parse_cmdline(argc, argv, OPTS);

	random_range_seed(getpid());	/* initialize random number generator */

	/*
	 * If this is a re-exec of doio, jump directly into the doio function.
	 */

	if (Execd) {
		doio();
		exit(E_SETUP);
	}

	/*
	 * Stop on all but a few signals...
	 */
	sigemptyset(&sa.sa_mask);
	sa.sa_handler = sigint_handler;
	sa.sa_flags = SA_RESETHAND;	/* sigint is ignored after the */
	/* first time */
	for (i = 1; i <= NSIG; i++) {
		switch (i) {
#ifdef SIGRECOVERY
		case SIGRECOVERY:
			break;
#endif
#ifdef SIGCKPT
		case SIGCKPT:
#endif
#ifdef SIGRESTART
		case SIGRESTART:
#endif
		case SIGTSTP:
		case SIGSTOP:
		case SIGCONT:
		case SIGCHLD:
		case SIGBUS:
		case SIGSEGV:
		case SIGQUIT:
			break;
		default:
			sigaction(i, &sa, NULL);
		}
	}

	/*
	 * If we're logging write operations, make a dummy call to wlog_open
	 * to initialize the write history file.  This call must be done in
	 * the parent, to ensure that the history file exists and/or has
	 * been truncated before any children attempt to open it, as the doio
	 * children are not allowed to truncate the file.
	 */

	if (w_opt) {
		strcpy(Wlog.w_file, Write_Log);

		if (wlog_open(&Wlog, 1, 0666) < 0) {
			doio_fprintf(stderr,
				     "Could not create/truncate write log %s\n",
				     Write_Log);
			exit(2);
		}

		wlog_close(&Wlog);
	}

	/*
	 * Malloc space for the children pid array.  Initialize all entries
	 * to -1.
	 */

	Children = malloc(sizeof(int) * Nprocs);
	for (i = 0; i < Nprocs; i++) {
		Children[i] = -1;
	}

	sigemptyset(&block_mask);
	sigaddset(&block_mask, SIGCHLD);
	sigprocmask(SIG_BLOCK, &block_mask, &omask);

	/*
	 * Fork Nprocs.  This [parent] process is a watchdog, to notify the
	 * invoker of procs which exit abnormally, and to make sure that all
	 * child procs get cleaned up.  If the -e option was used, we will also
	 * re-exec.  This is mostly for unicos/mk on mpp's, to ensure that not
	 * all of the doio's don't end up in the same pe.
	 *
	 * Note - if Nprocs is 1, or this doio is a multi-pe app (Npes > 1),
	 * jump directly to doio().  multi-pe apps can't fork(), and there is
	 * no reason to fork() for 1 proc.
	 */

	if (Nprocs == 1 || Npes > 1) {
		doio();
		exit(0);
	} else {
		for (i = 0; i < Nprocs; i++) {
			if ((pid = fork()) == -1) {
				doio_fprintf(stderr,
					     "(parent) Could not fork %d children:  %s (%d)\n",
					     i + 1, SYSERR, errno);
				exit(E_SETUP);
			}

			Children[Nchildren] = pid;
			Nchildren++;

			if (pid == 0) {
				if (e_opt) {
					char *exec_path;

					exec_path = argv[0];
					argv[0] = malloc(strlen(exec_path) + 2);
					sprintf(argv[0], "-%s", exec_path);

					execvp(exec_path, argv);
					doio_fprintf(stderr,
						     "(parent) Could not execvp %s:  %s (%d)\n",
						     exec_path, SYSERR, errno);
					exit(E_SETUP);
				} else {
					doio();
					exit(E_SETUP);
				}
			}
		}

		/*
		 * Parent spins on wait(), until all children exit.
		 */

		ex_stat = E_NORMAL;

		while (Nprocs) {
			if ((pid = wait(&stat)) == -1) {
				if (errno == EINTR)
					continue;
			}

			for (i = 0; i < Nchildren; i++)
				if (Children[i] == pid)
					Children[i] = -1;

			Nprocs--;

			if (WIFEXITED(stat)) {
				switch (WEXITSTATUS(stat)) {
				case E_NORMAL:
					/* noop */
					break;

				case E_INTERNAL:
					doio_fprintf(stderr,
						     "(parent) pid %d exited because of an internal error\n",
						     pid);
					ex_stat |= E_INTERNAL;
					break;

				case E_SETUP:
					doio_fprintf(stderr,
						     "(parent) pid %d exited because of a setup error\n",
						     pid);
					ex_stat |= E_SETUP;
					break;

				case E_COMPARE:
					doio_fprintf(stderr,
						     "(parent) pid %d exited because of data compare errors\n",
						     pid);

					ex_stat |= E_COMPARE;

					if (a_opt)
						kill(0, SIGINT);

					break;

				case E_USAGE:
					doio_fprintf(stderr,
						     "(parent) pid %d exited because of a usage error\n",
						     pid);

					ex_stat |= E_USAGE;
					break;

				default:
					doio_fprintf(stderr,
						     "(parent) pid %d exited with unknown status %d\n",
						     pid, WEXITSTATUS(stat));
					ex_stat |= E_INTERNAL;
					break;
				}
			} else if (WIFSIGNALED(stat)
				   && WTERMSIG(stat) != SIGINT) {
				doio_fprintf(stderr,
					     "(parent) pid %d terminated by signal %d\n",
					     pid, WTERMSIG(stat));

				ex_stat |= E_SIGNAL;
			}

			fflush(NULL);
		}
	}

	exit(ex_stat);

}				/* main */

/*
 * main doio function.  Each doio child starts here, and never returns.
 */

void doio(void)
{
	int rval, i, infd, nbytes;
	char *cp;
	struct io_req ioreq;
	struct sigaction sa, def_action, ignore_action, exit_action;
#ifndef CRAY
	struct sigaction sigbus_action;
#endif

	Memsize = Sdssize = 0;

	/*
	 * Initialize the Pattern - write-type syscalls will replace Pattern[1]
	 * with the pattern passed in the request.  Make sure that
	 * strlen(Pattern) is not mod 16 so that out of order words will be
	 * detected.
	 */

	gethostname(Host, sizeof(Host));
	if ((cp = strchr(Host, '.')) != NULL)
		*cp = '\0';

	Pattern_Length = sprintf(Pattern, "-:%d:%s:%s*", getpid(), Host, Prog);

	if (!(Pattern_Length % 16)) {
		Pattern_Length = sprintf(Pattern, "-:%d:%s:%s**",
					 getpid(), Host, Prog);
	}

	/*
	 * Open a couple of descriptors for the write-log file.  One descriptor
	 * is for appending, one for random access.  Write logging is done for
	 * file corruption detection.  The program doio_check is capable of
	 * doing corruption detection based on a doio write-log.
	 */

	if (w_opt) {

		strcpy(Wlog.w_file, Write_Log);

		if (wlog_open(&Wlog, 0, 0666) == -1) {
			doio_fprintf(stderr,
				     "Could not open write log file (%s): wlog_open() failed\n",
				     Write_Log);
			exit(E_SETUP);
		}
	}

	/*
	 * Open the input stream - either a file or stdin
	 */

	if (Infile == NULL) {
		infd = 0;
	} else {
		if ((infd = open(Infile, O_RDWR)) == -1) {
			doio_fprintf(stderr,
				     "Could not open input file (%s):  %s (%d)\n",
				     Infile, SYSERR, errno);
			exit(E_SETUP);
		}
	}

	/*
	 * Define a set of signals that should never be masked.  Receipt of
	 * these signals generally indicates a programming error, and we want
	 * a corefile at the point of error.  We put SIGQUIT in this list so
	 * that ^\ will force a user core dump.
	 *
	 * Note:  the handler for these should be SIG_DFL, all of them
	 * produce a corefile as the default action.
	 */

	ignore_action.sa_handler = SIG_IGN;
	ignore_action.sa_flags = 0;
	sigemptyset(&ignore_action.sa_mask);

	def_action.sa_handler = SIG_DFL;
	def_action.sa_flags = 0;
	sigemptyset(&def_action.sa_mask);

#ifdef sgi
	exit_action.sa_sigaction = cleanup_handler;
	exit_action.sa_flags = SA_SIGINFO;
	sigemptyset(&exit_action.sa_mask);

	sa.sa_sigaction = die_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);

	sigbus_action.sa_sigaction = sigbus_handler;
	sigbus_action.sa_flags = SA_SIGINFO;
	sigemptyset(&sigbus_action.sa_mask);
#else
	exit_action.sa_handler = cleanup_handler;
	exit_action.sa_flags = 0;
	sigemptyset(&exit_action.sa_mask);

	sa.sa_handler = die_handler;
	sa.sa_flags = 0;
	sigemptyset(&sa.sa_mask);

#ifndef CRAY
	sigbus_action.sa_handler = sigbus_handler;
	sigbus_action.sa_flags = 0;
	sigemptyset(&sigbus_action.sa_mask);
#endif
#endif

	for (i = 1; i <= NSIG; i++) {
		switch (i) {
			/* Signals to terminate program on */
		case SIGINT:
			sigaction(i, &exit_action, NULL);
			break;

#ifndef CRAY
			/* This depends on active_mmap_rw */
		case SIGBUS:
			sigaction(i, &sigbus_action, NULL);
			break;
#endif

			/* Signals to Ignore... */
		case SIGSTOP:
		case SIGCONT:
#ifdef SIGRECOVERY
		case SIGRECOVERY:
#endif
			sigaction(i, &ignore_action, NULL);
			break;

			/* Signals to trap & report & die */
			/*case SIGTRAP: */
			/*case SIGABRT: */
#ifdef SIGERR			/* cray only signals */
		case SIGERR:
		case SIGBUFIO:
		case SIGINFO:
#endif
			/*case SIGFPE: */
		case SIGURG:
		case SIGHUP:
		case SIGTERM:
		case SIGPIPE:
		case SIGIO:
		case SIGUSR1:
		case SIGUSR2:
			sigaction(i, &sa, NULL);
			break;

			/* Default Action for all other signals */
		default:
			sigaction(i, &def_action, NULL);
			break;
		}
	}

	/*
	 * Main loop - each doio proc does this until the read returns eof (0).
	 * Call the appropriate io function based on the request type.
	 */

	while ((nbytes = read(infd, (char *)&ioreq, sizeof(ioreq)))) {

		/*
		 * Periodically check our ppid.  If it is 1, the child exits to
		 * help clean up in the case that the main doio process was
		 * killed.
		 */

		if (Reqno && ((Reqno % PPID_CHECK_INTERVAL) == 0)) {
			if (getppid() == 1) {
				doio_fprintf(stderr,
					     "Parent doio process has exited\n");
				alloc_mem(-1);
				exit(E_SETUP);
			}
		}

		if (nbytes == -1) {
			doio_fprintf(stderr,
				     "read of %d bytes from input failed:  %s (%d)\n",
				     sizeof(ioreq), SYSERR, errno);
			alloc_mem(-1);
			exit(E_SETUP);
		}

		if (nbytes != sizeof(ioreq)) {
			doio_fprintf(stderr,
				     "read wrong # bytes from input stream, expected %d, got %d\n",
				     sizeof(ioreq), nbytes);
			alloc_mem(-1);
			exit(E_SETUP);
		}

		if (ioreq.r_magic != DOIO_MAGIC) {
			doio_fprintf(stderr,
				     "got a bad magic # from input stream.  Expected 0%o, got 0%o\n",
				     DOIO_MAGIC, ioreq.r_magic);
			alloc_mem(-1);
			exit(E_SETUP);
		}

		/*
		 * If we're on a Release_Interval multiple, relase all ssd and
		 * core space, and close all fd's in Fd_Map[].
		 */

		if (Reqno && Release_Interval && !(Reqno % Release_Interval)) {
			if (Memsize) {
#ifdef NOTDEF
				sbrk(-1 * Memsize);
#else
				alloc_mem(-1);
#endif
			}
#ifdef _CRAY1
			if (Sdssize) {
				ssbreak(-1 * btoc(Sdssize));
				Sdsptr = 0;
				Sdssize = 0;
			}
#endif /* _CRAY1 */

			alloc_fd(NULL, 0);
		}

		switch (ioreq.r_type) {
		case READ:
		case READA:
			rval = do_read(&ioreq);
			break;

		case WRITE:
		case WRITEA:
			rval = do_write(&ioreq);
			break;

		case READV:
		case AREAD:
		case PREAD:
		case LREAD:
		case LREADA:
		case LSREAD:
		case LSREADA:
		case WRITEV:
		case AWRITE:
		case PWRITE:
		case MMAPR:
		case MMAPW:
		case LWRITE:
		case LWRITEA:
		case LSWRITE:
		case LSWRITEA:
		case LEREAD:
		case LEREADA:
		case LEWRITE:
		case LEWRITEA:
			rval = do_rw(&ioreq);
			break;

#ifdef CRAY
		case SSREAD:
		case SSWRITE:
			rval = do_ssdio(&ioreq);
			break;

		case LISTIO:
			rval = do_listio(&ioreq);
			break;
#endif

#ifdef sgi
		case RESVSP:
		case UNRESVSP:
#ifdef F_FSYNC
		case DFFSYNC:
#endif
			rval = do_fcntl(&ioreq);
			break;
#endif /* sgi */

#ifndef CRAY
		case FSYNC2:
		case FDATASYNC:
			rval = do_sync(&ioreq);
			break;
#endif
		default:
			doio_fprintf(stderr,
				     "Don't know how to handle io request type %d\n",
				     ioreq.r_type);
			alloc_mem(-1);
			exit(E_SETUP);
		}

		if (rval == SKIP_REQ) {
			Reqskipcnt++;
		} else if (rval != 0) {
			alloc_mem(-1);
			doio_fprintf(stderr,
				     "doio(): operation %d returned != 0\n",
				     ioreq.r_type);
			exit(E_SETUP);
		}

		if (Message_Interval && Reqno % Message_Interval == 0) {
			doio_fprintf(stderr,
				     "Info:  %d requests done (%d skipped) by this process\n",
				     Reqno, Reqskipcnt);
		}

		Reqno++;

		if (delayop != 0)
			doio_delay();
	}

	/*
	 * Child exits normally
	 */
	alloc_mem(-1);
	exit(E_NORMAL);

}				/* doio */

void doio_delay(void)
{
	struct timeval tv_delay;
	struct sigaction sa_al, sa_old;
	sigset_t al_mask;

	switch (delayop) {
	case DELAY_SELECT:
		tv_delay.tv_sec = delaytime / 1000000;
		tv_delay.tv_usec = delaytime % 1000000;
		/*doio_fprintf(stdout, "delay_select: %d %d\n",
		   tv_delay.tv_sec, tv_delay.tv_usec); */
		select(0, NULL, NULL, NULL, &tv_delay);
		break;

	case DELAY_SLEEP:
		sleep(delaytime);
		break;

#ifdef sgi
	case DELAY_SGINAP:
		sginap(delaytime);
		break;
#endif

	case DELAY_ALARM:
		sa_al.sa_flags = 0;
		sa_al.sa_handler = noop_handler;
		sigemptyset(&sa_al.sa_mask);
		sigaction(SIGALRM, &sa_al, &sa_old);
		sigemptyset(&al_mask);
		alarm(delaytime);
		sigsuspend(&al_mask);
		sigaction(SIGALRM, &sa_old, 0);
		break;
	}
}

/*
 * Format IO requests, returning a pointer to the formatted text.
 *
 * format_strat	- formats the async i/o completion strategy
 * format_rw	- formats a read[a]/write[a] request
 * format_sds	- formats a ssread/sswrite request
 * format_listio- formats a listio request
 *
 * ioreq is the doio io request structure.
 */

/*
 * Printable names for the I/O request types.
 *
 * The table is terminated by the {"unknown", -1} entry rather than by a
 * NULL string.  The LEREAD/LEREADA/LEWRITE/LEWRITEA types dispatched in
 * doio() have no entry here.
 */
struct smap sysnames[] = {
	{"READ", READ},
	{"WRITE", WRITE},
	{"READA", READA},
	{"WRITEA", WRITEA},
	{"SSREAD", SSREAD},
	{"SSWRITE", SSWRITE},
	{"LISTIO", LISTIO},
	{"LREAD", LREAD},
	{"LREADA", LREADA},
	{"LWRITE", LWRITE},
	{"LWRITEA", LWRITEA},
	{"LSREAD", LSREAD},
	{"LSREADA", LSREADA},
	{"LSWRITE", LSWRITE},
	{"LSWRITEA", LSWRITEA},

	/* Irix System Calls */
	{"PREAD", PREAD},
	{"PWRITE", PWRITE},
	{"AREAD", AREAD},
	{"AWRITE", AWRITE},
	{"LLREAD", LLREAD},
	{"LLAREAD", LLAREAD},
	{"LLWRITE", LLWRITE},
	{"LLAWRITE", LLAWRITE},
	{"RESVSP", RESVSP},
	{"UNRESVSP", UNRESVSP},
	{"DFFSYNC", DFFSYNC},

	/* Irix and Linux System Calls */
	{"READV", READV},
	{"WRITEV", WRITEV},
	{"MMAPR", MMAPR},
	{"MMAPW", MMAPW},
	{"FSYNC2", FSYNC2},
	{"FDATASYNC", FDATASYNC},

	/* terminator */
	{"unknown", -1},
};

/*
 * Printable names for the async I/O completion strategies (A_*).
 * Value 0 is named "synch"; the table is terminated by {"unknown", -1}.
 */
struct smap aionames[] = {
	{"poll", A_POLL},
	{"signal", A_SIGNAL},
	{"recall", A_RECALL},
	{"recalla", A_RECALLA},
	{"recalls", A_RECALLS},
	{"suspend", A_SUSPEND},
	{"callback", A_CALLBACK},
	{"synch", 0},
	/* terminator */
	{"unknown", -1},
};

/*
 * Format open(2) flags as text.
 *
 * oflags - flags as passed to open(2).
 *
 * Returns a newly allocated string (via strdup) holding one
 * comma-terminated name per recognised flag, e.g. "O_RDWR,O_EXCL,".  The
 * access mode always comes first; a mode that is none of
 * O_RDONLY/O_WRONLY/O_RDWR is shown as "O_weird,".  The caller owns the
 * string and must free() it.  Returns NULL if strdup() fails.
 */
char *format_oflags(int oflags)
{
	char flags[255];

	flags[0] = '\0';

	/* The low two bits hold the access mode. */
	switch (oflags & 03) {
	case O_RDONLY:
		strcat(flags, "O_RDONLY,");
		break;
	case O_WRONLY:
		strcat(flags, "O_WRONLY,");
		break;
	case O_RDWR:
		strcat(flags, "O_RDWR,");
		break;
	default:
		/*
		 * Comma-terminated like every other entry, so the flags
		 * appended below do not run into this one.
		 */
		strcat(flags, "O_weird,");
		break;
	}

	if (oflags & O_EXCL)
		strcat(flags, "O_EXCL,");

	if (oflags & O_SYNC)
		strcat(flags, "O_SYNC,");
#ifdef CRAY
	if (oflags & O_RAW)
		strcat(flags, "O_RAW,");
	if (oflags & O_WELLFORMED)
		strcat(flags, "O_WELLFORMED,");
#ifdef O_SSD
	if (oflags & O_SSD)
		strcat(flags, "O_SSD,");
#endif
	if (oflags & O_LDRAW)
		strcat(flags, "O_LDRAW,");
	if (oflags & O_PARALLEL)
		strcat(flags, "O_PARALLEL,");
	if (oflags & O_BIG)
		strcat(flags, "O_BIG,");
	if (oflags & O_PLACE)
		strcat(flags, "O_PLACE,");
	if (oflags & O_ASYNC)
		strcat(flags, "O_ASYNC,");
#endif

#ifdef sgi
	if (oflags & O_DIRECT)
		strcat(flags, "O_DIRECT,");
	if (oflags & O_DSYNC)
		strcat(flags, "O_DSYNC,");
	if (oflags & O_RSYNC)
		strcat(flags, "O_RSYNC,");
#endif

	return (strdup(flags));
}

/*
 * Return a printable name for an async I/O completion strategy.
 *
 * For the known A_* strategies and for 0 the result is a string constant.
 * For any other value it is a strdup()'ed "<error:...>" string showing
 * the value in octal.  The caller cannot tell the two cases apart from
 * the pointer alone.
 */
char *format_strat(int strategy)
{
	static const struct {
		int code;
		char *label;
	} known[] = {
		{A_POLL, "POLL"},
		{A_SIGNAL, "SIGNAL"},
		{A_RECALL, "RECALL"},
		{A_RECALLA, "RECALLA"},
		{A_RECALLS, "RECALLS"},
		{A_SUSPEND, "SUSPEND"},
		{A_CALLBACK, "CALLBACK"},
		{0, "<zero>"},
	};
	char scratch[64];
	size_t k;

	for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
		if (known[k].code == strategy)
			return known[k].label;
	}

	/* Not a strategy we know: report the raw value. */
	sprintf(scratch, "<error:%#o>", strategy);
	return strdup(scratch);
}

/*
 * Format a read/write/reada/writea request as multi-line text for error
 * messages.
 *
 * ioreq   - the request being described
 * fd      - descriptor the call was issued on
 * buffer  - user buffer address passed to the call
 * signo   - signal number (printed for READA/WRITEA only)
 * pattern - data pattern (printed for WRITE/WRITEA only)
 * iosw    - status-word address (printed for READA/WRITEA only)
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call; the caller must not free it.  For request types other than
 * READ/WRITE/READA/WRITEA only the "Request number" line is produced.
 */
char *format_rw(struct io_req *ioreq, int fd, void *buffer, int signo,
		char *pattern, void *iosw)
{
	/*
	 * Static storage instead of a lazily malloc'ed buffer: the old code
	 * never checked the malloc() result before writing through it.
	 */
	static char errbuf[32768];
	char *aio_strat, *cp;
	struct read_req *readp = &ioreq->r_data.read;
	struct write_req *writep = &ioreq->r_data.write;
	struct read_req *readap = &ioreq->r_data.read;
	struct write_req *writeap = &ioreq->r_data.write;

	cp = errbuf;
	cp += sprintf(cp, "Request number %d\n", Reqno);

	switch (ioreq->r_type) {
	case READ:
		cp += sprintf(cp, "syscall:  read(%d, %#lo, %d)\n",
			      fd, (unsigned long)buffer, readp->r_nbytes);
		cp +=
		    sprintf(cp,
			    "          fd %d is file %s - open flags are %#o\n",
			    fd, readp->r_file, readp->r_oflags);
		cp +=
		    sprintf(cp, "          read done at file offset %d\n",
			    readp->r_offset);
		break;

	case WRITE:
		cp += sprintf(cp, "syscall:  write(%d, %#lo, %d)\n",
			      fd, (unsigned long)buffer, writep->r_nbytes);
		cp +=
		    sprintf(cp,
			    "          fd %d is file %s - open flags are %#o\n",
			    fd, writep->r_file, writep->r_oflags);
		cp +=
		    sprintf(cp,
			    "          write done at file offset %d - pattern is %s\n",
			    writep->r_offset, pattern);
		break;

	case READA:
		aio_strat = format_strat(readap->r_aio_strat);

		cp += sprintf(cp, "syscall:  reada(%d, %#lo, %d, %#lo, %d)\n",
			      fd, (unsigned long)buffer, readap->r_nbytes,
			      (unsigned long)iosw, signo);
		cp +=
		    sprintf(cp,
			    "          fd %d is file %s - open flags are %#o\n",
			    fd, readap->r_file, readap->r_oflags);
		cp +=
		    sprintf(cp, "          reada done at file offset %d\n",
			    readap->r_offset);
		cp +=
		    sprintf(cp,
			    "          async io completion strategy is %s\n",
			    aio_strat);
		break;

	case WRITEA:
		aio_strat = format_strat(writeap->r_aio_strat);

		cp += sprintf(cp, "syscall:  writea(%d, %#lo, %d, %#lo, %d)\n",
			      fd, (unsigned long)buffer, writeap->r_nbytes,
			      (unsigned long)iosw, signo);
		cp +=
		    sprintf(cp,
			    "          fd %d is file %s - open flags are %#o\n",
			    fd, writeap->r_file, writeap->r_oflags);
		cp +=
		    sprintf(cp,
			    "          writea done at file offset %d - pattern is %s\n",
			    writeap->r_offset, pattern);
		cp +=
		    sprintf(cp,
			    "          async io completion strategy is %s\n",
			    aio_strat);
		break;

	}

	return errbuf;
}

#ifdef CRAY
/*
 * Build a printable description of an ssread/sswrite request for use in
 * error messages.
 *
 * ioreq   - request being described; r_type selects the SSREAD or SSWRITE
 *           text (any other type yields only the request-number line)
 * buffer  - memory-side address that was passed to the call
 * sds     - sds-side address that was passed to the call
 * pattern - pattern string, printed for SSWRITE only
 *
 * Returns a pointer to a lazily allocated static buffer that is overwritten
 * by the next call, or NULL (after logging) if it could not be allocated.
 */
char *format_sds(struct io_req *ioreq, void *buffer, int sds, char *pattern)
{
	static char *errbuf = NULL;
	char *cp;

	struct ssread_req *ssreadp = &ioreq->r_data.ssread;
	struct sswrite_req *sswritep = &ioreq->r_data.sswrite;

	if (errbuf == NULL) {
		errbuf = malloc(32768);
		if (errbuf == NULL) {
			/* same reporting convention as fmt_listio()/fmt_pread() */
			doio_fprintf(stderr, "malloc failed, %s/%d\n",
				     __FILE__, __LINE__);
			return NULL;
		}
	}

	cp = errbuf;
	cp += sprintf(cp, "Request number %d\n", Reqno);

	switch (ioreq->r_type) {
	case SSREAD:
		/* pointer is cast so the argument matches the conversion */
		cp += sprintf(cp, "syscall:  ssread(%#lo, %#o, %d)\n",
			      (unsigned long)buffer, sds, ssreadp->r_nbytes);
		break;

	case SSWRITE:
		cp +=
		    sprintf(cp,
			    "syscall:  sswrite(%#lo, %#o, %d) - pattern was %s\n",
			    (unsigned long)buffer, sds, sswritep->r_nbytes,
			    pattern);
		break;
	}
	return errbuf;
}
#endif /* CRAY */

/*
 * Perform the various sorts of disk reads
 */

/*
 * do_read - carry out a single read request.
 *
 * The request's file, open flags, offset and byte count are taken from
 * req->r_data.read.  A descriptor is obtained with alloc_fd(), a buffer is
 * obtained with alloc_sds() or alloc_mem() (which one, and how the buffer
 * address is adjusted, depends on the platform #ifdefs below), and the
 * transfer is dispatched on req->r_type:
 *
 *	READ	- lseek() to the offset, then read()
 *	READA	- (CRAY builds only) lseek(), reada(), then wait for and
 *		  check the completion status
 *
 * A request type that matches no case performs no transfer and falls
 * through to the final "return 0".
 *
 * Returns 0 on success, -1 if the descriptor, seek or transfer fails, or
 * the negative value handed back by alloc_mem() when that call fails.
 */
int do_read(struct io_req *req)
{
	int fd, offset, nbytes, oflags, rval;
	char *addr, *file;
#ifdef CRAY
	struct aio_info *aiop;
	int aio_id, aio_strat, signo;
#endif
#ifdef sgi
	struct fd_cache *fdc;
#endif

	/*
	 * Initialize common fields - assumes r_oflags, r_file, r_offset, and
	 * r_nbytes are at the same offset in the read_req and reada_req
	 * structures.
	 */

	file = req->r_data.read.r_file;
	oflags = req->r_data.read.r_oflags;
	offset = req->r_data.read.r_offset;
	nbytes = req->r_data.read.r_nbytes;

	/*printf("read: %s, %#o, %d %d\n", file, oflags, offset, nbytes); */

	/*
	 * Grab an open file descriptor
	 * Note: must be done before memory allocation so that the direct i/o
	 *      information is available in mem. allocate
	 */

	if ((fd = alloc_fd(file, oflags)) == -1)
		return -1;

	/*
	 * Allocate core or sds - based on the O_SSD flag
	 */

	/*
	 * wtob(x) expands to x * sizeof(UINT64_T); it is only defined here
	 * when no definition is already in effect.  The argument is not
	 * parenthesized in the expansion, so it should only be given simple
	 * operands (every use in this function passes the literal 1).
	 */
#ifndef wtob
#define wtob(x)	(x * sizeof(UINT64_T))
#endif

#ifdef CRAY
	if (oflags & O_SSD) {
		if (alloc_sds(nbytes) == -1)
			return -1;

		addr = (char *)Sdsptr;
	} else {
		/* request extra room beyond nbytes for the address bump below */
		if ((rval =
		     alloc_mem(nbytes + wtob(1) * 2 +
			       MPP_BUMP * sizeof(UINT64_T))) < 0) {
			return rval;
		}

		addr = Memptr;

		/*
		 * if io is not raw, bump the offset by a random amount
		 * to generate non-word-aligned io.
		 */
		if (!(req->r_data.read.r_uflags & F_WORD_ALIGNED)) {
			addr += random_range(0, wtob(1) - 1, 1, NULL);
		}
	}
#else
#ifdef sgi
	/* get memory alignment for using DIRECT I/O */
	/* NOTE(review): fdc is dereferenced below without a NULL check */
	fdc = alloc_fdcache(file, oflags);

	if ((rval = alloc_mem(nbytes + wtob(1) * 2 + fdc->c_memalign)) < 0) {
		return rval;
	}

	addr = Memptr;

	if ((req->r_data.read.r_uflags & F_WORD_ALIGNED)) {
		/*
		 * Force memory alignment for Direct I/O
		 */
		if ((oflags & O_DIRECT) && ((long)addr % fdc->c_memalign != 0)) {
			/* round addr up to the next multiple of c_memalign */
			addr +=
			    fdc->c_memalign - ((long)addr % fdc->c_memalign);
		}
	} else {
		/* unaligned request: shift the buffer start by 0..wtob(1)-1 */
		addr += random_range(0, wtob(1) - 1, 1, NULL);
	}
#else
	/* what is !CRAY && !sgi ? */
	if ((rval = alloc_mem(nbytes + wtob(1) * 2)) < 0) {
		return rval;
	}

	addr = Memptr;
#endif /* !CRAY && sgi */
#endif /* CRAY */

	switch (req->r_type) {
	case READ:
		/* move to the desired file position. */
		if (lseek(fd, offset, SEEK_SET) == -1) {
			doio_fprintf(stderr,
				     "lseek(%d, %d, SEEK_SET) failed:  %s (%d)\n",
				     fd, offset, SYSERR, errno);
			return -1;
		}

		/* a short read is treated as a failure, just like -1 */
		if ((rval = read(fd, addr, nbytes)) == -1) {
			doio_fprintf(stderr,
				     "read() request failed:  %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_rw(req, fd, addr, -1, NULL, NULL));
			doio_upanic(U_RVAL);
			return -1;
		} else if (rval != nbytes) {
			doio_fprintf(stderr,
				     "read() request returned wrong # of bytes - expected %d, got %d\n%s\n",
				     nbytes, rval,
				     format_rw(req, fd, addr, -1, NULL, NULL));
			doio_upanic(U_RVAL);
			return -1;
		}
		break;

#ifdef CRAY
	case READA:
		/*
		 * Async read
		 */

		/* move to the desired file position. */
		if (lseek(fd, offset, SEEK_SET) == -1) {
			doio_fprintf(stderr,
				     "lseek(%d, %d, SEEK_SET) failed:  %s (%d)\n",
				     fd, offset, SYSERR, errno);
			return -1;
		}

		/*
		 * NOTE(review): r_aio_strat is read through the "read" member
		 * although this is a READA request - presumably relies on the
		 * shared layout mentioned at the top; confirm.
		 */
		aio_strat = req->r_data.read.r_aio_strat;
		/* only the A_SIGNAL strategy asks for a completion signal */
		signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;

		aio_id = aio_register(fd, aio_strat, signo);
		aiop = aio_slot(aio_id);

		if (reada(fd, addr, nbytes, &aiop->iosw, signo) == -1) {
			doio_fprintf(stderr, "reada() failed: %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_rw(req, fd, addr, signo, NULL,
					       &aiop->iosw));
			aio_unregister(aio_id);
			doio_upanic(U_RVAL);
			rval = -1;
		} else {
			/*
			 * Wait for io to complete
			 */

			aio_wait(aio_id);

			/*
			 * make sure the io completed without error
			 * (only sw_count is compared; sw_flag and sw_error
			 * are just reported in the message)
			 */

			if (aiop->iosw.sw_count != nbytes) {
				doio_fprintf(stderr,
					     "Bad iosw from reada()\nExpected (%d,%d,%d), got (%d,%d,%d)\n%s\n",
					     1, 0, nbytes,
					     aiop->iosw.sw_flag,
					     aiop->iosw.sw_error,
					     aiop->iosw.sw_count,
					     format_rw(req, fd, addr, signo,
						       NULL, &aiop->iosw));
				aio_unregister(aio_id);
				doio_upanic(U_IOSW);
				rval = -1;
			} else {
				aio_unregister(aio_id);
				rval = 0;
			}
		}

		if (rval == -1)
			return rval;
		break;
#endif /* CRAY */
	}

	return 0;		/* if we get here, everything went ok */
}

/*
 * Perform the various types of disk writes.
 */

/*
 * do_write - carry out a single write request.
 *
 * Sequence:
 *   1. get a descriptor (alloc_fd) and a pattern-filled buffer
 *      (platform-specific allocation, see the #ifdefs below);
 *   2. if k_opt is set, write-lock the target region of the file;
 *   3. if w_opt is set, append a preliminary (w_done = 0) write-log record;
 *   4. perform the transfer selected by req->r_type (WRITE, or WRITEA on
 *      CRAY builds);
 *   5. if v_opt is set, verify the data with check_file();
 *   6. if a log record was written, rewrite it with w_done = 1;
 *   7. release the region lock if one was taken.
 *
 * Returns 0 on success and -1 on failure; when alloc_mem() fails its
 * negative return value is passed through unchanged.  The process exits
 * with E_INTERNAL if the region lock cannot be taken or released, and with
 * E_COMPARE if verification reports corruption.
 */
int do_write(struct io_req *req)
{
	static int pid = -1;
	int fd, nbytes, oflags, signo;
	int logged_write, rval, got_lock;
	off_t offset, woffset;
	char *addr, pattern, *file, *msg;
	struct wlog_rec wrec;
#ifdef CRAY
	int aio_strat, aio_id;
	/* stays NULL unless an async write registers an aio slot */
	struct aio_info *aiop = NULL;
#endif
#ifdef sgi
	struct fd_cache *fdc;
#endif

	woffset = 0;

	/*
	 * Misc variable setup
	 */

	signo = 0;
	nbytes = req->r_data.write.r_nbytes;
	offset = req->r_data.write.r_offset;
	pattern = req->r_data.write.r_pattern;
	file = req->r_data.write.r_file;
	oflags = req->r_data.write.r_oflags;

	/*printf("pwrite: %s, %#o, %d %d\n", file, oflags, offset, nbytes); */

	/*
	 * Install this request's pattern character as the first byte of the
	 * global Pattern used to fill the buffer below.
	 */

	Pattern[0] = pattern;

	/*
	 * Get a descriptor to do the io on
	 */

	if ((fd = alloc_fd(file, oflags)) == -1)
		return -1;

	/*printf("write: %d, %s, %#o, %d %d\n",
	   fd, file, oflags, offset, nbytes); */

	/*
	 * Allocate core memory (and SDS space for a backdoor write if
	 * desired), and initialize the data to be written.
	 */

#ifdef CRAY
	if (oflags & O_SSD) {
#ifndef _CRAYMPP
		if ((rval = alloc_mem(nbytes + wtob(1))) < 0) {
			return rval;
		}

		(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);
		/*pattern_fill(Memptr, nbytes, Pattern, Pattern_Length, 0); */

		if (alloc_sds(nbytes) == -1)
			return -1;

		if (sswrite((long)Memptr, Sdsptr, btoc(nbytes)) == -1) {
			doio_fprintf(stderr,
				     "sswrite(%d, %d, %d) failed:  %s (%d)\n",
				     (long)Memptr, Sdsptr, btoc(nbytes), SYSERR,
				     errno);
			fflush(stderr);
			return -1;
		}

		addr = (char *)Sdsptr;
#else
		doio_fprintf(stderr,
			     "Invalid O_SSD flag was generated for MPP system\n");
		fflush(stderr);
		return -1;
#endif /* !CRAYMPP */
	} else {
		/*
		 * The comparison must be applied to the assigned value, not
		 * assigned itself; otherwise rval becomes 0/1 and a failure
		 * would be returned as the positive value 1.
		 */
		if ((rval = alloc_mem(nbytes + wtob(1))) < 0) {
			return rval;
		}

		addr = Memptr;

		/*
		 * if io is not raw, bump the offset by a random amount
		 * to generate non-word-aligned io.
		 */

		if (!(req->r_data.write.r_uflags & F_WORD_ALIGNED)) {
			addr += random_range(0, wtob(1) - 1, 1, NULL);
		}

		/* fill at Memptr, then slide the data to the bumped address */
		(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);
		if (addr != Memptr)
			memmove(addr, Memptr, nbytes);
	}
#else /* CRAY */
#ifdef sgi
	/* get memory alignment for using DIRECT I/O */
	fdc = alloc_fdcache(file, oflags);

	if ((rval = alloc_mem(nbytes + wtob(1) * 2 + fdc->c_memalign)) < 0) {
		return rval;
	}

	addr = Memptr;

	if ((req->r_data.write.r_uflags & F_WORD_ALIGNED)) {
		/*
		 * Force memory alignment for Direct I/O
		 */
		if ((oflags & O_DIRECT) && ((long)addr % fdc->c_memalign != 0)) {
			addr +=
			    fdc->c_memalign - ((long)addr % fdc->c_memalign);
		}
	} else {
		addr += random_range(0, wtob(1) - 1, 1, NULL);
	}

	/* fill at Memptr, then slide the data to the adjusted address */
	(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);
	if (addr != Memptr)
		memmove(addr, Memptr, nbytes);

#else /* sgi */
	if ((rval = alloc_mem(nbytes + wtob(1) * 2)) < 0) {
		return rval;
	}

	addr = Memptr;

	(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);
	if (addr != Memptr)
		memmove(addr, Memptr, nbytes);
#endif /* sgi */
#endif /* CRAY */

	rval = -1;
	got_lock = 0;
	logged_write = 0;

	if (k_opt) {
		if (lock_file_region(file, fd, F_WRLCK, offset, nbytes) < 0) {
			alloc_mem(-1);
			exit(E_INTERNAL);
		}

		got_lock = 1;
	}

	/*
	 * Write a preliminary write-log entry.  This is done so that
	 * doio_check can do corruption detection across an interrupt/crash.
	 * Note that w_done is set to 0.  If doio_check sees this, it
	 * re-creates the file extents as if the write completed, but does not
	 * do any checking - see comments in doio_check for more details.
	 */

	if (w_opt) {
		if (pid == -1) {
			pid = getpid();
		}
		wrec.w_async = (req->r_type == WRITEA) ? 1 : 0;
		wrec.w_oflags = oflags;
		wrec.w_pid = pid;
		wrec.w_offset = offset;
		wrec.w_nbytes = nbytes;

		wrec.w_pathlen = strlen(file);
		memcpy(wrec.w_path, file, wrec.w_pathlen);
		wrec.w_hostlen = strlen(Host);
		memcpy(wrec.w_host, Host, wrec.w_hostlen);
		wrec.w_patternlen = Pattern_Length;
		memcpy(wrec.w_pattern, Pattern, wrec.w_patternlen);

		wrec.w_done = 0;

		if ((woffset = wlog_record_write(&Wlog, &wrec, -1)) == -1) {
			doio_fprintf(stderr,
				     "Could not append to write-log:  %s (%d)\n",
				     SYSERR, errno);
		} else {
			logged_write = 1;
		}
	}

	switch (req->r_type) {
	case WRITE:
		/*
		 * sync write
		 */

		if (lseek(fd, offset, SEEK_SET) == -1) {
			/* offset is an off_t; cast so it matches %ld */
			doio_fprintf(stderr,
				     "lseek(%d, %ld, SEEK_SET) failed:  %s (%d)\n",
				     fd, (long)offset, SYSERR, errno);
			/* do not return with the region still locked */
			rval = -1;
			goto unlock;
		}

		rval = write(fd, addr, nbytes);

		if (rval == -1) {
			doio_fprintf(stderr,
				     "write() failed:  %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_rw(req, fd, addr, -1, Pattern,
					       NULL));
			/*
			 * Second, more detailed report.  Pointer and off_t
			 * arguments are cast to match their conversions.
			 */
#ifdef sgi
			doio_fprintf(stderr,
				     "write() failed:  %s\n\twrite(%d, %#lo, %d)\n\toffset %ld, nbytes%%miniou(%d)=%d, oflags=%#o memalign=%d, addr%%memalign=%ld\n",
				     strerror(errno),
				     fd, (unsigned long)addr, nbytes,
				     (long)offset,
				     fdc->c_miniosz, nbytes % fdc->c_miniosz,
				     oflags, fdc->c_memalign,
				     (long)((long)addr % fdc->c_memalign));
#else
			doio_fprintf(stderr,
				     "write() failed:  %s\n\twrite(%d, %#lo, %d)\n\toffset %ld, nbytes%%1B=%d, oflags=%#o\n",
				     strerror(errno),
				     fd, (unsigned long)addr, nbytes,
				     (long)offset, nbytes % 4096, oflags);
#endif
			doio_upanic(U_RVAL);
		} else if (rval != nbytes) {
			doio_fprintf(stderr,
				     "write() returned wrong # bytes - expected %d, got %d\n%s\n",
				     nbytes, rval,
				     format_rw(req, fd, addr, -1, Pattern,
					       NULL));
			doio_upanic(U_RVAL);
			rval = -1;
		}

		break;

#ifdef CRAY
	case WRITEA:
		/*
		 * async write
		 */
		if (lseek(fd, offset, SEEK_SET) == -1) {
			doio_fprintf(stderr,
				     "lseek(%d, %ld, SEEK_SET) failed:  %s (%d)\n",
				     fd, (long)offset, SYSERR, errno);
			/* do not return with the region still locked */
			rval = -1;
			goto unlock;
		}

		aio_strat = req->r_data.write.r_aio_strat;
		signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;

		aio_id = aio_register(fd, aio_strat, signo);
		aiop = aio_slot(aio_id);

		/*
		 * init iosw and do the async write
		 */

		if (writea(fd, addr, nbytes, &aiop->iosw, signo) == -1) {
			doio_fprintf(stderr,
				     "writea() failed: %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_rw(req, fd, addr, -1, Pattern,
					       NULL));
			doio_upanic(U_RVAL);
			aio_unregister(aio_id);
			rval = -1;
		} else {

			/*
			 * Wait for io to complete
			 */

			aio_wait(aio_id);

			/*
			 * check that iosw is ok
			 */

			if (aiop->iosw.sw_count != nbytes) {
				doio_fprintf(stderr,
					     "Bad iosw from writea()\nExpected (%d,%d,%d), got (%d,%d,%d)\n%s\n",
					     1, 0, nbytes,
					     aiop->iosw.sw_flag,
					     aiop->iosw.sw_error,
					     aiop->iosw.sw_count,
					     format_rw(req, fd, addr, -1,
						       Pattern, &aiop->iosw));
				aio_unregister(aio_id);
				doio_upanic(U_IOSW);
				rval = -1;
			} else {
				aio_unregister(aio_id);
				rval = 0;
			}
		}
		break;

#endif /* CRAY */
	}

	/*
	 * Verify that the data was written correctly - check_file() returns
	 * a non-null pointer which contains an error message if there are
	 * problems.
	 */

	if (v_opt) {
		msg = check_file(file, offset, nbytes, Pattern, Pattern_Length,
				 0, oflags & O_PARALLEL);
		if (msg != NULL) {
			doio_fprintf(stderr, "%s%s\n", msg,
#ifdef CRAY
				     /* no aio slot exists for a plain WRITE */
				     format_rw(req, fd, addr, -1, Pattern,
					       (aiop != NULL) ? &aiop->iosw :
					       NULL)
#else
				     format_rw(req, fd, addr, -1, Pattern, NULL)
#endif
			    );
			doio_upanic(U_CORRUPTION);
			exit(E_COMPARE);

		}
	}

	/*
	 * General cleanup ...
	 *
	 * Write extent information to the write-log, so that doio_check can do
	 * corruption detection.  Note that w_done is set to 1, indicating that
	 * the write has been verified as complete.  We don't need to write the
	 * filename on the second logging.
	 */

	if (w_opt && logged_write) {
		wrec.w_done = 1;
		wlog_record_write(&Wlog, &wrec, woffset);
	}

unlock:
	/*
	 * Unlock file region if necessary.  Seek failures jump here directly
	 * so the lock taken above is always released; the preliminary
	 * write-log record is deliberately left with w_done == 0 in that case.
	 */

	if (got_lock) {
		if (lock_file_region(file, fd, F_UNLCK, offset, nbytes) < 0) {
			alloc_mem(-1);
			exit(E_INTERNAL);
		}
	}

	return ((rval == -1) ? -1 : 0);
}

/*
 * Simple routine to lock/unlock a file using fcntl()
 */

/*
 * lock_file_region - set or clear an fcntl() record lock, waiting if
 * necessary (F_SETLKW).
 *
 * fname  - file name, used only in the error message
 * fd     - descriptor the lock is applied to
 * type   - value stored in l_type (callers in this file pass F_WRLCK or
 *          F_UNLCK)
 * start  - offset of the region, measured from the start of the file
 * nbytes - length of the region
 *
 * Returns 0 on success, -1 (after logging) if fcntl() fails.
 */
int lock_file_region(char *fname, int fd, int type, int start, int nbytes)
{
	struct flock flk;
	int saved_errno, open_flags;

	flk.l_type = type;
	flk.l_whence = SEEK_SET;
	flk.l_start = start;
	flk.l_len = nbytes;

	if (fcntl(fd, F_SETLKW, &flk) < 0) {
		/*
		 * Fetch the open flags in a separate statement: argument
		 * evaluation order is unspecified, so calling fcntl() inside
		 * the argument list could overwrite errno before the lock
		 * failure is reported.
		 */
		saved_errno = errno;
		open_flags = fcntl(fd, F_GETFL, 0);
		errno = saved_errno;

		doio_fprintf(stderr,
			     "fcntl(%d, %d, %#lo) failed for file %s, lock type %d, offset %d, length %d:  %s (%d), open flags: %#o\n",
			     fd, F_SETLKW, (unsigned long)&flk, fname, type,
			     start, nbytes, SYSERR, errno,
			     open_flags);
		return -1;
	}

	return 0;
}

/*
 * Perform a listio request.
 */

#ifdef CRAY
/*
 * Build a printable dump of a listio() call for use in error messages.
 *
 * ioreq   - request the call was made for (supplies the aio strategy)
 * lcmd    - listio command; LC_START and LC_WAIT are named, anything else
 *           is shown as "???"
 * list    - array of listreq structures handed to listio()
 * nent    - number of entries of list to dump
 * fd      - unused here
 * pattern - unused here
 *
 * Every entry's li_status pointer is dereferenced, so it must be valid.
 *
 * Returns a pointer to a lazily allocated static 32768-byte buffer that is
 * overwritten by the next call, or NULL (after logging) if it could not be
 * allocated.  The output is not bounds-checked against that size.
 */
char *format_listio(struct io_req *ioreq, int lcmd, struct listreq *list,
		    int nent, int fd, char *pattern)
{
	static char *errbuf = NULL;
	struct listio_req *liop = &ioreq->r_data.listio;
	struct listreq *listreq;
	char *cp, *cmd, *opcode, *aio_strat;
	int i;

	switch (lcmd) {
	case LC_START:
		cmd = "LC_START";
		break;
	case LC_WAIT:
		cmd = "LC_WAIT";
		break;
	default:
		cmd = "???";
		break;
	}

	if (errbuf == NULL) {
		errbuf = malloc(32768);
		if (errbuf == NULL) {
			/* same reporting convention as fmt_listio() */
			doio_fprintf(stderr, "malloc failed, %s/%d\n",
				     __FILE__, __LINE__);
			return NULL;
		}
	}

	cp = errbuf;
	cp += sprintf(cp, "Request number %d\n", Reqno);

	/* pointers are cast so the arguments match their conversions */
	cp += sprintf(cp, "syscall:  listio(%s, %#lo, %d)\n\n", cmd,
		      (unsigned long)list, nent);

	aio_strat = format_strat(liop->r_aio_strat);

	for (i = 0; i < nent; i++) {
		cp += sprintf(cp, "struct lioreq for request element %d\n", i);
		cp += sprintf(cp, "----------------------------------------\n");

		listreq = list + i;

		switch (listreq->li_opcode) {
		case LO_READ:
			opcode = "LO_READ";
			break;
		case LO_WRITE:
			opcode = "LO_WRITE";
			break;
		default:
			opcode = "???";
			break;
		}

		cp += sprintf(cp, "          li_opcode =    %s\n", opcode);
		cp +=
		    sprintf(cp, "          li_drvr =      %#o\n",
			    listreq->li_drvr);
		cp +=
		    sprintf(cp, "          li_flags =     %#o\n",
			    listreq->li_flags);
		cp +=
		    sprintf(cp, "          li_offset =    %d\n",
			    listreq->li_offset);
		cp +=
		    sprintf(cp, "          li_fildes =    %d\n",
			    listreq->li_fildes);
		cp +=
		    sprintf(cp, "          li_buf =       %#lo\n",
			    (unsigned long)listreq->li_buf);
		cp +=
		    sprintf(cp, "          li_nbyte =     %d\n",
			    listreq->li_nbyte);
		cp +=
		    sprintf(cp, "          li_status =    %#lo (%d, %d, %d)\n",
			    (unsigned long)listreq->li_status,
			    listreq->li_status->sw_flag,
			    listreq->li_status->sw_error,
			    listreq->li_status->sw_count);
		cp +=
		    sprintf(cp, "          li_signo =     %d\n",
			    listreq->li_signo);
		cp +=
		    sprintf(cp, "          li_nstride =   %d\n",
			    listreq->li_nstride);
		cp +=
		    sprintf(cp, "          li_filstride = %d\n",
			    listreq->li_filstride);
		cp +=
		    sprintf(cp, "          li_memstride = %d\n",
			    listreq->li_memstride);
		cp +=
		    sprintf(cp, "          io completion strategy is %s\n",
			    aio_strat);
	}
	return errbuf;
}
#endif /* CRAY */

/*
 * do_listio - carry out a single-entry listio request (CRAY builds only;
 * on every other build this simply returns -1).
 *
 * The request is rejected if a non-zero file stride is smaller than the
 * bytes per stride.  Otherwise a buffer large enough for all memory strides
 * is allocated (and pattern-filled for LO_WRITE), a descriptor is obtained,
 * the affected file range is write-locked when k_opt is set and the opcode
 * is LO_WRITE, and one listreq is issued.  After completion the iosw byte
 * count is checked and, with v_opt on a write, every stride is verified
 * with check_file().
 *
 * Returns 0 on success, -1 on failure (or the negative alloc_mem() value).
 * Exits with E_COMPARE if verification reports corruption.
 */
int do_listio(struct io_req *req)
{
#ifdef CRAY
	struct listio_req *lio;
	int fd, signo, i;
	int rval, got_lock;
	int aio_strat, aio_id;
	int min_byte, max_byte;
	int mem_needed;
	int foffset, fstride, mstride, nstrides;
	char *moffset;
	char *addr, *msg;
	sigset_t block_mask, omask;
	struct aio_info *aiop;
	struct listreq lio_req;

	lio = &req->r_data.listio;

	/*
	 * If bytes per stride is less than the stride size, drop the request
	 * since it will cause overlapping strides, and we cannot predict
	 * the order they will complete in.
	 */

	if (lio->r_filestride && abs(lio->r_filestride) < lio->r_nbytes) {
		doio_fprintf(stderr,
			     "do_listio():  Bogus listio request - abs(filestride) [%d] < nbytes [%d]\n",
			     abs(lio->r_filestride), lio->r_nbytes);
		return -1;
	}

	/*
	 * Allocate core memory.  Initialize the data to be written.  Make
	 * sure we get enough, based on the memstride.
	 */

	mem_needed =
	    stride_bounds(0, lio->r_memstride, lio->r_nstrides,
			  lio->r_nbytes, NULL, NULL);

	if ((rval = alloc_mem(mem_needed + wtob(1))) < 0) {
		return rval;
	}

	/*
	 * Set the memory address pointer.  If the io is not raw, adjust
	 * addr by a random amount, so that non-raw io is not necessarily
	 * word aligned.
	 */

	addr = Memptr;

	if (!(lio->r_uflags & F_WORD_ALIGNED)) {
		addr += random_range(0, wtob(1) - 1, 1, NULL);
	}

	if (lio->r_opcode == LO_WRITE) {
		Pattern[0] = lio->r_pattern;
		(*Data_Fill) (Memptr, mem_needed, Pattern, Pattern_Length, 0);
		if (addr != Memptr)
			memmove(addr, Memptr, mem_needed);
	}

	/*
	 * Get a descriptor to do the io on.  No need to do an lseek, as this
	 * is encoded in the listio request.
	 */

	if ((fd = alloc_fd(lio->r_file, lio->r_oflags)) == -1) {
		return -1;
	}

	rval = -1;
	got_lock = 0;

	/*
	 * If the opcode is LO_WRITE, lock all regions of the file that
	 * are touched by this listio request.  Currently, we use
	 * stride_bounds() to figure out the min and max bytes affected, and
	 * lock the entire region, regardless of the file stride.
	 */

	if (lio->r_opcode == LO_WRITE && k_opt) {
		stride_bounds(lio->r_offset,
			      lio->r_filestride, lio->r_nstrides,
			      lio->r_nbytes, &min_byte, &max_byte);

		if (lock_file_region(lio->r_file, fd, F_WRLCK,
				     min_byte, (max_byte - min_byte + 1)) < 0) {
			doio_fprintf(stderr,
				     "stride_bounds(%d, %d, %d, %d, ..., ...) set min_byte to %d, max_byte to %d\n",
				     lio->r_offset, lio->r_filestride,
				     lio->r_nstrides, lio->r_nbytes, min_byte,
				     max_byte);
			return -1;
		} else {
			got_lock = 1;
		}
	}

	/*
	 * Register the async request
	 */

	aio_strat = lio->r_aio_strat;
	signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;

	aio_id = aio_register(fd, aio_strat, signo);
	aiop = aio_slot(aio_id);

	/*
	 * Form the listio request, and make the call.
	 */

	lio_req.li_opcode = lio->r_opcode;
	lio_req.li_drvr = 0;
	lio_req.li_flags = LF_LSEEK;
	lio_req.li_offset = lio->r_offset;
	lio_req.li_fildes = fd;

	/*
	 * With a negative memory stride and more than one stride, start at
	 * the last nbytes-sized chunk of the buffer.
	 */
	if (lio->r_memstride >= 0 || lio->r_nstrides <= 1) {
		lio_req.li_buf = addr;
	} else {
		lio_req.li_buf = addr + mem_needed - lio->r_nbytes;
	}

	lio_req.li_nbyte = lio->r_nbytes;
	lio_req.li_status = &aiop->iosw;
	lio_req.li_signo = signo;
	lio_req.li_nstride = lio->r_nstrides;
	lio_req.li_filstride = lio->r_filestride;
	lio_req.li_memstride = lio->r_memstride;

	/*
	 * If signo != 0, block signo while we're in the system call, so that
	 * we don't get interrupted syscall failures.
	 */

	if (signo) {
		sigemptyset(&block_mask);
		sigaddset(&block_mask, signo);
		sigprocmask(SIG_BLOCK, &block_mask, &omask);
	}

	if (listio(lio->r_cmd, &lio_req, 1) < 0) {
		doio_fprintf(stderr,
			     "listio() failed: %s (%d)\n%s\n",
			     SYSERR, errno,
			     format_listio(req, lio->r_cmd, &lio_req, 1, fd,
					   Pattern));
		/*
		 * Put the caller's signal mask back on the failure path too;
		 * otherwise signo would stay blocked after this function
		 * returns.  Done after the message so errno is still intact.
		 */
		if (signo) {
			sigprocmask(SIG_SETMASK, &omask, NULL);
		}
		aio_unregister(aio_id);
		doio_upanic(U_RVAL);
		goto lio_done;
	}

	if (signo) {
		sigprocmask(SIG_SETMASK, &omask, NULL);
	}

	/*
	 * Wait for io to complete
	 */

	aio_wait(aio_id);

	/* a stride count of 0 is treated as a single stride */
	nstrides = lio->r_nstrides ? lio->r_nstrides : 1;
	if (aiop->iosw.sw_count != lio->r_nbytes * nstrides) {
		/* report the same expected count that was just compared */
		doio_fprintf(stderr,
			     "Bad iosw from listio()\nExpected (%d,%d,%d), got (%d,%d,%d)\n%s\n",
			     1, 0, lio->r_nbytes * nstrides,
			     aiop->iosw.sw_flag,
			     aiop->iosw.sw_error, aiop->iosw.sw_count,
			     format_listio(req, lio->r_cmd, &lio_req, 1, fd,
					   Pattern));
		aio_unregister(aio_id);
		doio_upanic(U_IOSW);
		goto lio_done;
	}

	aio_unregister(aio_id);

	/*
	 * Verify that the data was written correctly - check_file() returns
	 * a non-null pointer which contains an error message if there are
	 * problems.
	 *
	 * For listio, we basically have to make 1 call to check_file for each
	 * stride.
	 */

	if (v_opt && lio_req.li_opcode == LO_WRITE) {
		/* a zero stride means the strides are packed back to back */
		fstride = lio->r_filestride ? lio->r_filestride : lio->r_nbytes;
		mstride = lio->r_memstride ? lio->r_memstride : lio->r_nbytes;
		foffset = lio->r_offset;

		if (mstride > 0 || lio->r_nstrides <= 1) {
			moffset = addr;
		} else {
			moffset = addr + mem_needed - lio->r_nbytes;
		}

		for (i = 0; i < lio_req.li_nstride; i++) {
			msg = check_file(lio->r_file,
					 foffset, lio->r_nbytes,
					 Pattern, Pattern_Length,
					 moffset - addr,
					 lio->r_oflags & O_PARALLEL);

			if (msg != NULL) {
				doio_fprintf(stderr, "%s\n%s\n",
					     msg,
					     format_listio(req, lio->r_cmd,
							   &lio_req, 1, fd,
							   Pattern));
				doio_upanic(U_CORRUPTION);
				exit(E_COMPARE);
			}

			moffset += mstride;
			foffset += fstride;
		}

	}

	rval = 0;

lio_done:

	/*
	 * General cleanup ...
	 *
	 */

	/*
	 * Release file locks if necessary
	 */

	if (got_lock) {
		if (lock_file_region(lio->r_file, fd, F_UNLCK,
				     min_byte, (max_byte - min_byte + 1)) < 0) {
			return -1;
		}
	}

	return rval;
#else
	return -1;
#endif
}

/*
 * perform ssread/sswrite operations
 */

#ifdef _CRAY1

/*
 * do_ssdio - carry out an ssread or sswrite request between core memory
 * (Memptr) and sds space (Sdsptr).
 *
 * For SSWRITE the core buffer is pattern-filled and shipped to the sds;
 * with v_opt set it is then read back and compared against the pattern.
 * Any other request type is treated as a read from the sds.
 *
 * Returns 0 on success, -1 if an ssread()/sswrite() call or alloc_sds()
 * fails, or the negative value returned by alloc_mem().  Exits with
 * E_COMPARE if the read-back data does not match the pattern.
 */
int do_ssdio(struct io_req *req)
{
	int nbytes, nb;

	nbytes = req->r_data.ssread.r_nbytes;

	/*
	 * Grab core and sds space
	 */

	if ((nb = alloc_mem(nbytes)) < 0)
		return nb;

	if (alloc_sds(nbytes) == -1)
		return -1;

	if (req->r_type == SSWRITE) {

		/*
		 * Init data and ship it to the ssd
		 */

		Pattern[0] = req->r_data.sswrite.r_pattern;
		/*pattern_fill(Memptr, nbytes, Pattern, Pattern_Length, 0); */
		(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);

		if (sswrite((long)Memptr, (long)Sdsptr, btoc(nbytes)) == -1) {
			doio_fprintf(stderr, "sswrite() failed:  %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_sds(req, Memptr, Sdsptr, Pattern));
			doio_upanic(U_RVAL);
			return -1;
		}
	} else {
		/*
		 * read from sds
		 */

		if (ssread((long)Memptr, (long)Sdsptr, btoc(nbytes)) == -1) {
			doio_fprintf(stderr, "ssread() failed: %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_sds(req, Memptr, Sdsptr, Pattern));

			doio_upanic(U_RVAL);
			return -1;
		}
	}

	/*
	 * Verify data if SSWRITE and v_opt
	 */

	if (v_opt && req->r_type == SSWRITE) {
		/*
		 * The read-back must succeed for the comparison to mean
		 * anything: Memptr still holds the pattern that was just
		 * written, so a failed ssread() would otherwise let the
		 * check pass without looking at the sds contents.
		 */
		if (ssread((long)Memptr, (long)Sdsptr, btoc(nbytes)) == -1) {
			doio_fprintf(stderr, "ssread() failed: %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_sds(req, Memptr, Sdsptr, Pattern));

			doio_upanic(U_RVAL);
			return -1;
		}

		if (pattern_check(Memptr, nbytes, Pattern, Pattern_Length, 0) ==
		    -1) {
			doio_fprintf(stderr,
				     "sds DATA COMPARE ERROR - ABORTING\n%s\n",
				     format_sds(req, Memptr, Sdsptr, Pattern));

			doio_upanic(U_CORRUPTION);
			exit(E_COMPARE);
		}
	}

	/* the function is declared int; give callers a defined result */
	return 0;
}

#else

#ifdef CRAY

/*
 * Stand-in for do_ssdio() on CRAY builds where _CRAY1 is not defined.
 * Being called at all is reported as an internal error: the message is
 * logged, alloc_mem(-1) is called, and the process exits with E_INTERNAL.
 * The req argument is not examined and nothing is returned to the caller.
 */
int do_ssdio(struct io_req *req)
{
	doio_fprintf(stderr,
		     "Internal Error - do_ssdio() called on a non-cray1 system\n");
	/* same alloc_mem(-1) call the other E_INTERNAL exits in this file make */
	alloc_mem(-1);
	exit(E_INTERNAL);
}

#endif /* CRAY */

#endif /* _CRAY1 */

/*
 * fmt_ioreq - build a printable description of an i/o request for use in
 * error messages.
 *
 * ioreq - request to describe (the r_data.io view is used)
 * sy    - syscall table entry; its sy_flags select the read/write and
 *         async wording
 * fd    - descriptor the request was issued on; also queried for raw
 *         (CRAY) or direct (sgi) i/o details when those open flags are set
 *
 * Returns a pointer to a lazily allocated static buffer that is overwritten
 * by the next call, or NULL (after logging) if it could not be allocated.
 */
char *fmt_ioreq(struct io_req *ioreq, struct syscall_info *sy, int fd)
{
	static char *errbuf = NULL;
	char *cp;
	struct rw_req *io;
	struct smap *aname;
#ifdef CRAY
	struct stat sbuf;
#endif
#ifdef sgi
	struct dioattr finfo;
#endif

	if (errbuf == NULL) {
		errbuf = malloc(32768);
		if (errbuf == NULL) {
			/* same reporting convention as fmt_pread()/fmt_listio() */
			doio_fprintf(stderr, "malloc failed, %s/%d\n",
				     __FILE__, __LINE__);
			return NULL;
		}
	}

	io = &ioreq->r_data.io;

	/*
	 * Look up async I/O completion strategy.  The scan stops at the
	 * matching entry or at the table's terminating entry (value -1).
	 */
	for (aname = aionames;
	     aname->value != -1 && aname->value != io->r_aio_strat; aname++) ;

	cp = errbuf;
	cp += sprintf(cp, "Request number %d\n", Reqno);

	cp +=
	    sprintf(cp, "          fd %d is file %s - open flags are %#o %s\n",
		    fd, io->r_file, io->r_oflags, format_oflags(io->r_oflags));

	if (sy->sy_flags & SY_WRITE) {
		/* a NUL pattern byte is shown as '?' so the line stays intact */
		cp +=
		    sprintf(cp,
			    "          write done at file offset %d - pattern is %c (%#o)\n",
			    io->r_offset,
			    (io->r_pattern == '\0') ? '?' : io->r_pattern,
			    io->r_pattern);
	} else {
		cp += sprintf(cp, "          read done at file offset %d\n",
			      io->r_offset);
	}

	if (sy->sy_flags & SY_ASYNC) {
		cp +=
		    sprintf(cp,
			    "          async io completion strategy is %s\n",
			    aname->string);
	}

	cp +=
	    sprintf(cp,
		    "          number of requests is %d, strides per request is %d\n",
		    io->r_nent, io->r_nstrides);

	cp += sprintf(cp, "          i/o byte count = %d\n", io->r_nbytes);

	cp += sprintf(cp, "          memory alignment is %s\n",
		      (io->
		       r_uflags & F_WORD_ALIGNED) ? "aligned" : "unaligned");

#ifdef CRAY
	if (io->r_oflags & O_RAW) {
		cp +=
		    sprintf(cp,
			    "          RAW I/O: offset %% 4096 = %d length %% 4096 = %d\n",
			    io->r_offset % 4096, io->r_nbytes % 4096);
		fstat(fd, &sbuf);
		cp +=
		    sprintf(cp,
			    "          optimal file xfer size: small: %d large: %d\n",
			    sbuf.st_blksize, sbuf.st_oblksize);
		cp +=
		    sprintf(cp, "          cblks %d cbits %#o\n", sbuf.st_cblks,
			    sbuf.st_cbits);
	}
#endif
#ifdef sgi
	if (io->r_oflags & O_DIRECT) {

		if (fcntl(fd, F_DIOINFO, &finfo) == -1) {
			cp +=
			    sprintf(cp,
				    "          Error %s (%d) getting direct I/O info\n",
				    strerror(errno), errno);
			/* fall back to 1 so the modulo operations below are safe */
			finfo.d_mem = 1;
			finfo.d_miniosz = 1;
			finfo.d_maxiosz = 1;
		}

		/*
		 * Both moduli are taken against d_miniosz, so d_miniosz is
		 * what has to be printed after each "%%" in the message.
		 */
		cp +=
		    sprintf(cp,
			    "          DIRECT I/O: offset %% %d = %d length %% %d = %d\n",
			    finfo.d_miniosz, io->r_offset % finfo.d_miniosz,
			    finfo.d_miniosz, io->r_nbytes % finfo.d_miniosz);
		cp +=
		    sprintf(cp,
			    "          mem alignment 0x%x xfer size: small: %d large: %d\n",
			    finfo.d_mem, finfo.d_miniosz, finfo.d_maxiosz);
	}
#endif

	return (errbuf);
}

/*
 * Issue listio requests
 */
#ifdef CRAY
/*
 * sy_listio - issue the request as r_nent consecutive listio entries.
 *
 * Entry i transfers r_nbytes bytes between addr + i * r_nbytes and file
 * offset r_offset + i * r_nbytes.  Each entry gets its own aio slot; the
 * slot ids are stored in status->aioid, terminated by -1.  The call is
 * made with LC_START when the syscall entry is flagged SY_ASYNC and with
 * LC_WAIT otherwise.
 *
 * Returns a malloc'ed status (with a malloc'ed aioid array) holding the
 * listio() return value in rval and, on failure, errno in err (0
 * otherwise).  Returns NULL, after logging and releasing anything already
 * allocated here, if memory cannot be obtained.
 */
struct status *sy_listio(struct io_req *req, struct syscall_info *sysc, int fd,
			 char *addr)
{
	int offset, nbytes, nstrides, nents, aio_strat;
	int aio_id, signo, o, i, lc;
	char *a;
	struct listreq *lio_req, *l;
	struct aio_info *aiop;
	struct status *status;

	/*
	 * Pull the request parameters out of the generic io view of the
	 * request.
	 */
	offset = req->r_data.io.r_offset;
	nbytes = req->r_data.io.r_nbytes;
	nstrides = req->r_data.io.r_nstrides;
	nents = req->r_data.io.r_nent;
	aio_strat = req->r_data.io.r_aio_strat;

	lc = (sysc->sy_flags & SY_ASYNC) ? LC_START : LC_WAIT;

	status = malloc(sizeof(struct status));
	if (status == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		return NULL;
	}
	/* give err a defined value for the success case */
	status->err = 0;

	status->aioid = malloc((nents + 1) * sizeof(int));
	if (status->aioid == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		free(status);
		return NULL;
	}

	signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;

	lio_req = malloc(nents * sizeof(struct listreq));
	if (lio_req == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		free(status->aioid);
		free(status);
		return NULL;
	}
	for (l = lio_req, a = addr, o = offset, i = 0;
	     i < nents; l++, a += nbytes, o += nbytes, i++) {

		aio_id = aio_register(fd, aio_strat, signo);
		aiop = aio_slot(aio_id);
		status->aioid[i] = aio_id;

		l->li_opcode = (sysc->sy_flags & SY_WRITE) ? LO_WRITE : LO_READ;
		l->li_offset = o;
		l->li_fildes = fd;
		l->li_buf = a;
		l->li_nbyte = nbytes;
		l->li_status = &aiop->iosw;
		l->li_signo = signo;
		l->li_nstride = nstrides;
		l->li_filstride = 0;
		l->li_memstride = 0;
		l->li_drvr = 0;
		l->li_flags = LF_LSEEK;
	}

	status->aioid[nents] = -1;	/* end sentinel */

	if ((status->rval = listio(lc, lio_req, nents)) == -1) {
		status->err = errno;
	}

	free(lio_req);
	return (status);
}

/*
 * Calculate the size of a request in bytes and min/max boundaries
 *
 * This assumes filestride & memstride = 0.
 */
int listio_mem(struct io_req *req, int offset, int fmstride, int *min, int *max)
{
	/* every entry carries r_nstrides strides, so scale by r_nent */
	int total_strides =
	    req->r_data.io.r_nstrides * req->r_data.io.r_nent;

	/* stride_bounds() fills *min / *max and yields the byte count */
	return stride_bounds(offset, fmstride, total_strides,
			     req->r_data.io.r_nbytes, min, max);
}

/*
 * fmt_listio - describe the listio() call made by sy_listio() for use in
 * error messages.
 *
 * req  - request; supplies the entry count (r_data.io.r_nent)
 * sy   - syscall table entry; SY_ASYNC selects the command name shown
 * fd   - unused here
 * addr - data buffer address that was handed to the call
 *
 * Returns a pointer to a lazily allocated static buffer that is overwritten
 * by the next call, or NULL (after logging) if it could not be allocated.
 */
char *fmt_listio(struct io_req *req, struct syscall_info *sy, int fd,
		 char *addr)
{
	static char *errbuf = NULL;
	char *cp;
	char *c;

	if (errbuf == NULL) {
		errbuf = malloc(32768);
		if (errbuf == NULL) {
			doio_fprintf(stderr, "malloc failed, %s/%d\n",
				     __FILE__, __LINE__);
			return NULL;
		}
	}

	/*
	 * Must mirror sy_listio(), which issues LC_START for SY_ASYNC
	 * requests and LC_WAIT for all others.
	 */
	c = (sy->sy_flags & SY_ASYNC) ? "lc_start" : "lc_wait";

	cp = errbuf;
	cp += sprintf(cp, "syscall:  listio(%s, (?), %d)\n",
		      c, req->r_data.io.r_nent);

	/* pointer is cast so the argument matches the conversion */
	cp += sprintf(cp, "          data buffer at %#lo\n",
		      (unsigned long)addr);

	return (errbuf);
}
#endif /* CRAY */

#ifdef sgi
/*
 * sy_pread - perform the request with a single pread() of r_nbytes bytes
 * at r_offset into addr.
 *
 * Returns a malloc'ed status with aioid = NULL, rval = the pread() return
 * value and err = errno as it stood immediately after pread(); returns
 * NULL (after logging) if the status cannot be allocated.  sysc is unused.
 */
struct status *sy_pread(struct io_req *req, struct syscall_info *sysc, int fd,
			char *addr)
{
	int rc;
	int saved_errno;
	struct status *status;

	rc = pread(fd, addr, req->r_data.io.r_nbytes, req->r_data.io.r_offset);
	/* capture errno now: malloc() below is allowed to change it */
	saved_errno = errno;

	status = malloc(sizeof(struct status));
	if (status == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		return NULL;
	}
	status->aioid = NULL;
	status->rval = rc;
	status->err = saved_errno;

	return (status);
}

/*
 * sy_pwrite - perform the request with a single pwrite() of r_nbytes bytes
 * from addr at r_offset.
 *
 * Returns a malloc'ed status with aioid = NULL, rval = the pwrite() return
 * value and err = errno as it stood immediately after pwrite(); returns
 * NULL (after logging) if the status cannot be allocated.  sysc is unused.
 */
struct status *sy_pwrite(struct io_req *req, struct syscall_info *sysc, int fd,
			 char *addr)
{
	int rc;
	int saved_errno;
	struct status *status;

	rc = pwrite(fd, addr, req->r_data.io.r_nbytes, req->r_data.io.r_offset);
	/* capture errno now: malloc() below is allowed to change it */
	saved_errno = errno;

	status = malloc(sizeof(struct status));
	if (status == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		return NULL;
	}
	status->aioid = NULL;
	status->rval = rc;
	status->err = saved_errno;

	return (status);
}

/*
 * fmt_pread - describe a pread()/pwrite()-style call for error messages,
 * using the syscall name from sy->sy_name.
 *
 * Returns a pointer to a lazily allocated static buffer that is overwritten
 * by the next call, or NULL (after logging) if it could not be allocated.
 */
char *fmt_pread(struct io_req *req, struct syscall_info *sy, int fd, char *addr)
{
	static char *errbuf = NULL;

	if (errbuf == NULL) {
		errbuf = malloc(32768);
		if (errbuf == NULL) {
			doio_fprintf(stderr, "malloc failed, %s/%d\n",
				     __FILE__, __LINE__);
			return NULL;
		}
	}

	/* %lx takes an unsigned long, so the buffer address is cast */
	sprintf(errbuf, "syscall:  %s(%d, 0x%lx, %d)\n",
		sy->sy_name, fd, (unsigned long)addr,
		req->r_data.io.r_nbytes);
	return (errbuf);
}
#endif /* sgi */

#ifndef CRAY
/*
 * sy_readv - perform the request with readv(); thin wrapper that calls
 * sy_rwv() with rw = 0 and returns its result unchanged.
 */
struct status *sy_readv(struct io_req *req, struct syscall_info *sysc, int fd,
			char *addr)
{
	/*
	 * Full prototype instead of an empty parameter list, so the
	 * five-argument call below is type-checked.
	 */
	struct status *sy_rwv(struct io_req *req, struct syscall_info *sysc,
			      int fd, char *addr, int rw);

	return sy_rwv(req, sysc, fd, addr, 0);
}

/*
 * sy_writev - perform the request with writev(); thin wrapper that calls
 * sy_rwv() with rw = 1 and returns its result unchanged.
 */
struct status *sy_writev(struct io_req *req, struct syscall_info *sysc, int fd,
			 char *addr)
{
	/*
	 * Full prototype instead of an empty parameter list, so the
	 * five-argument call below is type-checked.
	 */
	struct status *sy_rwv(struct io_req *req, struct syscall_info *sysc,
			      int fd, char *addr, int rw);

	return sy_rwv(req, sysc, fd, addr, 1);
}

/*
 * sy_rwv - common body of sy_readv()/sy_writev().
 *
 * Seeks fd to the request's r_offset and then transfers r_nbytes bytes
 * to/from addr through a single-element iovec: writev() when rw is
 * non-zero, readv() otherwise.
 *
 * Returns a malloc'ed status with aioid = NULL, rval = the return value of
 * the failing lseek() or of readv()/writev(), and err = errno sampled right
 * after that call.  Returns NULL (after logging) if the status cannot be
 * allocated.  sysc is unused.
 */
struct status *sy_rwv(struct io_req *req, struct syscall_info *sysc, int fd,
		      char *addr, int rw)
{
	struct status *result;
	struct iovec vec;
	int ret;

	result = malloc(sizeof(struct status));
	if (result == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		return NULL;
	}
	result->aioid = NULL;

	/* position the descriptor; a failed seek is reported via the status */
	ret = lseek(fd, req->r_data.io.r_offset, SEEK_SET);
	if (ret == -1) {
		result->rval = ret;
		result->err = errno;
		return result;
	}

	/* the whole transfer goes through one vector element */
	vec.iov_base = addr;
	vec.iov_len = req->r_data.io.r_nbytes;

	ret = rw ? writev(fd, &vec, 1) : readv(fd, &vec, 1);

	result->rval = ret;
	result->err = errno;
	return result;
}

/*
 * fmt_readv - describe a readv()/writev()-style call for error messages,
 * using the syscall name from sy->sy_name.  req and addr are unused.
 *
 * Returns a pointer to a static buffer that the next call overwrites.
 */
char *fmt_readv(struct io_req *req, struct syscall_info *sy, int fd, char *addr)
{
	static char text[32768];

	sprintf(text, "syscall:  %s(%d, (iov on stack), 1)\n",
		sy->sy_name, fd);

	return text;
}
#endif /* !CRAY */

#ifdef sgi
/*
 * sy_aread - syscalls[] entry point for "aread".
 *
 * Forwards the request to sy_arw() with rw == 0 (read) and returns
 * whatever sy_arw() returns.
 */
struct status *sy_aread(struct io_req *req, struct syscall_info *sysc, int fd,
			char *addr)
{
	/* Full prototype so the call below is type-checked. */
	struct status *sy_arw(struct io_req *, struct syscall_info *, int,
			      char *, int);

	return sy_arw(req, sysc, fd, addr, 0);
}

/*
 * sy_awrite - syscalls[] entry point for "awrite".
 *
 * Forwards the request to sy_arw() with rw == 1 (write) and returns
 * whatever sy_arw() returns.
 */
struct status *sy_awrite(struct io_req *req, struct syscall_info *sysc, int fd,
			 char *addr)
{
	/* Full prototype so the call below is type-checked. */
	struct status *sy_arw(struct io_req *, struct syscall_info *, int,
			      char *, int);

	return sy_arw(req, sysc, fd, addr, 1);
}

/*
  #define sy_aread(A, B, C, D)	sy_arw(A, B, C, D, 0)
  #define sy_awrite(A, B, C, D)	sy_arw(A, B, C, D, 1)
 */

/*
 * sy_arw - shared implementation behind sy_aread() and sy_awrite()
 * (POSIX 1003.1b-1993 async read/write).
 *
 * Registers an aio slot for fd, fills in the slot's aiocb from the
 * request (offset, length, buffer, completion notification chosen by
 * r_aio_strat) and queues it with aio_write() when rw is non-zero or
 * aio_read() otherwise.
 *
 * Returns a malloc'ed struct status whose aioid is a malloc'ed,
 * -1 terminated list holding the registered aio id, whose rval is the
 * aio_read()/aio_write() return value and whose err is errno right after
 * that call.  Returns NULL (after logging) if memory cannot be allocated;
 * in that case nothing has been registered or queued.
 */
struct status *sy_arw(struct io_req *req, struct syscall_info *sysc, int fd,
		      char *addr, int rw)
{
	struct status *status;
	int rc;
	int aio_id, aio_strat, signo;
	struct aio_info *aiop;

	status = malloc(sizeof(struct status));
	if (status == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		return NULL;
	}

	/*
	 * Allocate the id list before anything is queued: failing here
	 * must neither leak 'status' nor leave an aio request in flight
	 * that nobody will wait for.
	 */
	status->aioid = malloc(2 * sizeof(int));
	if (status->aioid == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		free(status);
		return NULL;
	}

	aio_strat = req->r_data.io.r_aio_strat;
	signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;

	aio_id = aio_register(fd, aio_strat, signo);
	aiop = aio_slot(aio_id);

	memset((void *)&aiop->aiocb, 0, sizeof(aiocb_t));

	aiop->aiocb.aio_fildes = fd;
	aiop->aiocb.aio_nbytes = req->r_data.io.r_nbytes;
	aiop->aiocb.aio_offset = req->r_data.io.r_offset;
	aiop->aiocb.aio_buf = addr;
	aiop->aiocb.aio_reqprio = 0;	/* must be 0 */
	aiop->aiocb.aio_lio_opcode = 0;

	if (aio_strat == A_SIGNAL) {	/* siginfo(2) stuff */
		aiop->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
		aiop->aiocb.aio_sigevent.sigev_signo = signo;
	} else if (aio_strat == A_CALLBACK) {
		aiop->aiocb.aio_sigevent.sigev_signo = 0;
		aiop->aiocb.aio_sigevent.sigev_notify = SIGEV_CALLBACK;
		aiop->aiocb.aio_sigevent.sigev_func = cb_handler;
		aiop->aiocb.aio_sigevent.sigev_value.sival_int = aio_id;
	} else {
		aiop->aiocb.aio_sigevent.sigev_notify = SIGEV_NONE;
		aiop->aiocb.aio_sigevent.sigev_signo = 0;
	}

	if (rw)
		rc = aio_write(&aiop->aiocb);
	else
		rc = aio_read(&aiop->aiocb);

	/* Record errno immediately, before any other library call runs. */
	status->err = errno;
	status->rval = rc;
	status->aioid[0] = aio_id;
	status->aioid[1] = -1;
	return (status);
}

/*
 * fmt_aread - describe an aread/awrite request for error reports.
 *
 * Only the syscall name is variable; the text is written to a static
 * buffer that the next call overwrites.
 */
char *fmt_aread(struct io_req *req, struct syscall_info *sy, int fd, char *addr)
{
	static char errbuf[32768];

	sprintf(errbuf, "syscall:  %s(&aiop->aiocb)\n", sy->sy_name);
	return errbuf;
}
#endif /* sgi */

#ifndef CRAY

/*
 * sy_mmread - syscalls[] entry point for "mmap-read".
 *
 * Forwards the request to sy_mmrw() with rw == 0 (read) and returns
 * whatever sy_mmrw() returns.
 */
struct status *sy_mmread(struct io_req *req, struct syscall_info *sysc, int fd,
			 char *addr)
{
	/* Full prototype so the call below is type-checked. */
	struct status *sy_mmrw(struct io_req *, struct syscall_info *, int,
			       char *, int);

	return sy_mmrw(req, sysc, fd, addr, 0);
}

/*
 * sy_mmwrite - syscalls[] entry point for "mmap-write".
 *
 * Forwards the request to sy_mmrw() with rw == 1 (write) and returns
 * whatever sy_mmrw() returns.
 */
struct status *sy_mmwrite(struct io_req *req, struct syscall_info *sysc, int fd,
			  char *addr)
{
	/* Full prototype so the call below is type-checked. */
	struct status *sy_mmrw(struct io_req *, struct syscall_info *, int,
			       char *, int);

	return sy_mmrw(req, sysc, fd, addr, 1);
}

/*
 * sy_mmrw - shared implementation behind sy_mmread() and sy_mmwrite().
 *
 * mmap read/write.  This version is oriented towards mmaping the file to
 * memory ONCE and keeping it mapped: the mapping is remembered in the
 * file's fd_cache entry and reused on later calls.  When v_opt is set the
 * file is instead mapped afresh on every call, msync'ed after the copy and
 * unmapped again before returning.
 *
 * The transfer itself is a memcpy between addr and the mapped file at
 * r_offset, bracketed by active_mmap_rw = 1 / 0.
 *
 * Returns a malloc'ed struct status: rval is r_nbytes and err is 0 on
 * success; rval is -1 and err is the errno of the failing fstat()/mmap()
 * otherwise.  Returns NULL (after logging) if the status block cannot be
 * allocated.
 */
struct status *sy_mmrw(struct io_req *req, struct syscall_info *sysc, int fd,
		       char *addr, int rw)
{
	struct status *status;
	void *mrc = NULL, *memaddr = NULL;
	struct fd_cache *fdc;
	struct stat sbuf;
	int saved_errno;

	status = malloc(sizeof(struct status));
	if (status == NULL) {
		doio_fprintf(stderr, "malloc failed, %s/%d\n",
			     __FILE__, __LINE__);
		return NULL;
	}
	status->aioid = NULL;
	status->rval = -1;

	fdc = alloc_fdcache(req->r_data.io.r_file, req->r_data.io.r_oflags);

	if (v_opt || fdc->c_memaddr == NULL) {
		if (fstat(fd, &sbuf) < 0) {
			/*
			 * Capture errno before logging; the logging call
			 * performs i/o of its own and may change it.
			 */
			saved_errno = errno;
			doio_fprintf(stderr, "fstat failed, errno=%d\n",
				     saved_errno);
			status->err = saved_errno;
			return (status);
		}

		fdc->c_memlen = (int)sbuf.st_size;
		mrc = mmap(NULL, (int)sbuf.st_size,
			   rw ? PROT_WRITE | PROT_READ : PROT_READ,
			   MAP_SHARED, fd, 0);

		if (mrc == MAP_FAILED) {
			saved_errno = errno;
			/* %lx takes an unsigned long, not a pointer. */
			doio_fprintf(stderr, "mmap() failed - 0x%lx %d\n",
				     (unsigned long)mrc, saved_errno);
			status->err = saved_errno;
			return (status);
		}

		fdc->c_memaddr = mrc;
	}

	memaddr = (void *)((char *)fdc->c_memaddr + req->r_data.io.r_offset);

	active_mmap_rw = 1;
	if (rw)
		memcpy(memaddr, addr, req->r_data.io.r_nbytes);
	else
		memcpy(addr, memaddr, req->r_data.io.r_nbytes);
	/* sbuf and mrc are always set here when v_opt is on (see above). */
	if (v_opt)
		msync(fdc->c_memaddr, (int)sbuf.st_size, MS_SYNC);
	active_mmap_rw = 0;

	status->rval = req->r_data.io.r_nbytes;
	status->err = 0;

	if (v_opt) {
		munmap(mrc, (int)sbuf.st_size);
	}

	return (status);
}

/*
 * fmt_mmrw - describe an mmap-read/mmap-write request for error reports.
 *
 * Reports the mmap() call as recorded in the file's fd_cache entry
 * (mapped length, protection derived from SY_WRITE, descriptor), the
 * address the file is mapped at, and the source/target addresses and
 * length of the copy.  The text lives in a static buffer that the next
 * call overwrites.
 */
char *fmt_mmrw(struct io_req *req, struct syscall_info *sy, int fd, char *addr)
{
	static char errbuf[32768];
	struct fd_cache *cache;
	const char *prot;
	char *filemem;
	char *out = errbuf;

	cache = alloc_fdcache(req->r_data.io.r_file, req->r_data.io.r_oflags);
	prot = (sy->sy_flags & SY_WRITE) ? "PROT_WRITE" : "PROT_READ";

	/* Location inside the mapping that the request copies to/from. */
	filemem = (char *)cache->c_memaddr + req->r_data.io.r_offset;

	out += sprintf(out, "syscall:  %s(NULL, %d, %s, MAP_SHARED, %d, 0)\n",
		       sy->sy_name, cache->c_memlen, prot, fd);

	out += sprintf(out, "\tfile is mmaped to: 0x%lx\n",
		       (unsigned long)cache->c_memaddr);

	out += sprintf(out, "\tfile-mem=0x%lx, length=%d, buffer=0x%lx\n",
		       (unsigned long)filemem, req->r_data.io.r_nbytes,
		       (unsigned long)addr);

	return errbuf;
}
#endif /* !CRAY */

/*
 * Dispatch table used by do_rw(): one entry per request type it can
 * service.  do_rw() scans the table for the entry whose sy_type equals
 * req->r_type and stops at the entry whose sy_name is NULL.
 *
 * Each initializer lists, positionally: the printable name, the request
 * type, the function that issues the request, an optional buffer-sizing
 * function (NULL when do_rw() should just use r_nbytes), the function that
 * formats the request for error messages, and SY_* flags (SY_WRITE,
 * SY_ASYNC, SY_IOSW).
 * NOTE(review): the positional order is assumed to match the member order
 * of struct syscall_info in doio.h - confirm before reordering.
 */
struct syscall_info syscalls[] = {
#ifdef CRAY
	{"listio-read-sync", LREAD,
	 sy_listio, NULL, fmt_listio,
	 SY_IOSW},
	{"listio-read-strides-sync", LSREAD,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW},
	{"listio-read-reqs-sync", LEREAD,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW},
	{"listio-read-async", LREADA,
	 sy_listio, NULL, fmt_listio,
	 SY_IOSW | SY_ASYNC},
	{"listio-read-strides-async", LSREADA,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW | SY_ASYNC},
	{"listio-read-reqs-async", LEREADA,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW | SY_ASYNC},
	{"listio-write-sync", LWRITE,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW | SY_WRITE},
	{"listio-write-strides-sync", LSWRITE,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW | SY_WRITE},
	{"listio-write-reqs-sync", LEWRITE,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW | SY_WRITE},
	{"listio-write-async", LWRITEA,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW | SY_WRITE | SY_ASYNC},
	{"listio-write-strides-async", LSWRITEA,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW | SY_WRITE | SY_ASYNC},
	{"listio-write-reqs-async", LEWRITEA,
	 sy_listio, listio_mem, fmt_listio,
	 SY_IOSW | SY_WRITE | SY_ASYNC},
#endif

#ifdef sgi
	/* the write variants share the read variants' format functions */
	{"aread", AREAD,
	 sy_aread, NULL, fmt_aread,
	 SY_IOSW | SY_ASYNC},
	{"awrite", AWRITE,
	 sy_awrite, NULL, fmt_aread,
	 SY_IOSW | SY_WRITE | SY_ASYNC},
	{"pread", PREAD,
	 sy_pread, NULL, fmt_pread,
	 0},
	{"pwrite", PWRITE,
	 sy_pwrite, NULL, fmt_pread,
	 SY_WRITE},
#endif

#ifndef CRAY
	{"readv", READV,
	 sy_readv, NULL, fmt_readv,
	 0},
	{"writev", WRITEV,
	 sy_writev, NULL, fmt_readv,
	 SY_WRITE},
	{"mmap-read", MMAPR,
	 sy_mmread, NULL, fmt_mmrw,
	 0},
	{"mmap-write", MMAPW,
	 sy_mmwrite, NULL, fmt_mmrw,
	 SY_WRITE},
#endif

	/* end-of-table marker: do_rw() stops at the NULL name */
	{NULL, 0,
	 0, 0, 0,
	 0},
};

int do_rw(struct io_req *req)
{
	static int pid = -1;
	int fd, offset, nbytes, nstrides, nents, oflags;
	int rval, mem_needed, i;
	int logged_write, got_lock, pattern;
	off_t woffset;
	int min_byte, max_byte;
	char *addr, *file, *msg;
	struct status *s;
	struct wlog_rec wrec;
	struct syscall_info *sy;
#if defined(CRAY) || defined(sgi)
	struct aio_info *aiop;
	struct iosw *iosw;
#endif
#ifdef sgi
	struct fd_cache *fdc;
#endif

	woffset = 0;

	/*
	 * Initialize common fields - assumes r_oflags, r_file, r_offset, and
	 * r_nbytes are at the same offset in the read_req and reada_req
	 * structures.
	 */
	file = req->r_data.io.r_file;
	oflags = req->r_data.io.r_oflags;
	offset = req->r_data.io.r_offset;
	nbytes = req->r_data.io.r_nbytes;
	nstrides = req->r_data.io.r_nstrides;
	nents = req->r_data.io.r_nent;
	pattern = req->r_data.io.r_pattern;

	if (nents >= MAX_AIO) {
		doio_fprintf(stderr,
			     "do_rw: too many list requests, %d.  Maximum is %d\n",
			     nents, MAX_AIO);
		return (-1);
	}

	/*
	 * look up system call info
	 */
	for (sy = syscalls; sy->sy_name != NULL && sy->sy_type != req->r_type;
	     sy++) ;

	if (sy->sy_name == NULL) {
		doio_fprintf(stderr, "do_rw: unknown r_type %d.\n",
			     req->r_type);
		return (-1);
	}

	/*
	 * Get an open file descriptor
	 * Note: must be done before memory allocation so that the direct i/o
	 *      information is available in mem. allocate
	 */

	if ((fd = alloc_fd(file, oflags)) == -1)
		return -1;

	/*
	 * Allocate core memory and possibly sds space.  Initialize the
	 * data to be written.  Make sure we get enough, based on the
	 * memstride.
	 *
	 * need:
	 *      1 extra word for possible partial-word address "bump"
	 *      1 extra word for dynamic pattern overrun
	 *      MPP_BUMP extra words for T3E non-hw-aligned memory address.
	 */

	if (sy->sy_buffer != NULL) {
		mem_needed = (*sy->sy_buffer) (req, 0, 0, NULL, NULL);
	} else {
		mem_needed = nbytes;
	}

#ifdef CRAY
	if ((rval =
	     alloc_mem(mem_needed + wtob(1) * 2 +
		       MPP_BUMP * sizeof(UINT64_T))) < 0) {
		return rval;
	}
#else
#ifdef sgi
	/* get memory alignment for using DIRECT I/O */
	fdc = alloc_fdcache(file, oflags);

	if ((rval = alloc_mem(mem_needed + wtob(1) * 2 + fdc->c_memalign)) < 0) {
		return rval;
	}
#else
	/* what is !CRAY && !sgi ? */
	if ((rval = alloc_mem(mem_needed + wtob(1) * 2)) < 0) {
		return rval;
	}
#endif /* sgi */
#endif /* CRAY */

	Pattern[0] = pattern;

	/*
	 * Allocate SDS space for backdoor write if desired
	 */

	if (oflags & O_SSD) {
#ifdef CRAY
#ifndef _CRAYMPP
		if (alloc_sds(nbytes) == -1)
			return -1;

		if (sy->sy_flags & SY_WRITE) {
			/*pattern_fill(Memptr, mem_needed, Pattern, Pattern_Length, 0); */
			(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length,
				      0);

			if (sswrite((long)Memptr, Sdsptr, btoc(mem_needed)) ==
			    -1) {
				doio_fprintf(stderr,
					     "sswrite(%d, %d, %d) failed:  %s (%d)\n",
					     (long)Memptr, Sdsptr,
					     btoc(mem_needed), SYSERR, errno);
				fflush(stderr);
				return -1;
			}
		}

		addr = (char *)Sdsptr;
#else
		doio_fprintf(stderr,
			     "Invalid O_SSD flag was generated for MPP system\n");
		fflush(stderr);
		return -1;
#endif /* _CRAYMPP */
#else /* CRAY */
		doio_fprintf(stderr,
			     "Invalid O_SSD flag was generated for non-Cray system\n");
		fflush(stderr);
		return -1;
#endif /* CRAY */
	} else {
		addr = Memptr;

		/*
		 * if io is not raw, bump the offset by a random amount
		 * to generate non-word-aligned io.
		 *
		 * On MPP systems, raw I/O must start on an 0x80 byte boundary.
		 * For non-aligned I/O, bump the address from 1 to 8 words.
		 */

		if (!(req->r_data.io.r_uflags & F_WORD_ALIGNED)) {
#ifdef _CRAYMPP
			addr +=
			    random_range(0, MPP_BUMP, 1, NULL) * sizeof(int);
#endif
			addr += random_range(0, wtob(1) - 1, 1, NULL);
		}
#ifdef sgi
		/*
		 * Force memory alignment for Direct I/O
		 */
		if ((oflags & O_DIRECT) && ((long)addr % fdc->c_memalign != 0)) {
			addr +=
			    fdc->c_memalign - ((long)addr % fdc->c_memalign);
		}
#endif

		/*
		 * FILL must be done on a word-aligned buffer.
		 * Call the fill function with Memptr which is aligned,
		 * then memmove it to the right place.
		 */
		if (sy->sy_flags & SY_WRITE) {
			(*Data_Fill) (Memptr, mem_needed, Pattern,
				      Pattern_Length, 0);
			if (addr != Memptr)
				memmove(addr, Memptr, mem_needed);
		}
	}

	rval = 0;
	got_lock = 0;
	logged_write = 0;

	/*
	 * Lock data if this is a write and locking option is set
	 */
	if (sy->sy_flags & SY_WRITE && k_opt) {
		if (sy->sy_buffer != NULL) {
			(*sy->sy_buffer) (req, offset, 0, &min_byte, &max_byte);
		} else {
			min_byte = offset;
			max_byte = offset + (nbytes * nstrides * nents);
		}

		if (lock_file_region(file, fd, F_WRLCK,
				     min_byte, (max_byte - min_byte + 1)) < 0) {
			doio_fprintf(stderr,
				     "file lock failed:\n%s\n",
				     fmt_ioreq(req, sy, fd));
			doio_fprintf(stderr,
				     "          buffer(req, %d, 0, 0x%x, 0x%x)\n",
				     offset, min_byte, max_byte);
			alloc_mem(-1);
			exit(E_INTERNAL);
		}

		got_lock = 1;
	}

	/*
	 * Write a preliminary write-log entry.  This is done so that
	 * doio_check can do corruption detection across an interrupt/crash.
	 * Note that w_done is set to 0.  If doio_check sees this, it
	 * re-creates the file extents as if the write completed, but does not
	 * do any checking - see comments in doio_check for more details.
	 */

	if (sy->sy_flags & SY_WRITE && w_opt) {
		if (pid == -1) {
			pid = getpid();
		}

		wrec.w_async = (sy->sy_flags & SY_ASYNC) ? 1 : 0;
		wrec.w_oflags = oflags;
		wrec.w_pid = pid;
		wrec.w_offset = offset;
		wrec.w_nbytes = nbytes;	/* mem_needed -- total length */

		wrec.w_pathlen = strlen(file);
		memcpy(wrec.w_path, file, wrec.w_pathlen);
		wrec.w_hostlen = strlen(Host);
		memcpy(wrec.w_host, Host, wrec.w_hostlen);
		wrec.w_patternlen = Pattern_Length;
		memcpy(wrec.w_pattern, Pattern, wrec.w_patternlen);

		wrec.w_done = 0;

		if ((woffset = wlog_record_write(&Wlog, &wrec, -1)) == -1) {
			doio_fprintf(stderr,
				     "Could not append to write-log:  %s (%d)\n",
				     SYSERR, errno);
		} else {
			logged_write = 1;
		}
	}

	s = (*sy->sy_syscall) (req, sy, fd, addr);

	if (s->rval == -1) {
		doio_fprintf(stderr,
			     "%s() request failed:  %s (%d)\n%s\n%s\n",
			     sy->sy_name, SYSERR, errno,
			     fmt_ioreq(req, sy, fd),
			     (*sy->sy_format) (req, sy, fd, addr));

		doio_upanic(U_RVAL);

		for (i = 0; i < nents; i++) {
			if (s->aioid == NULL)
				break;
			aio_unregister(s->aioid[i]);
		}
		rval = -1;
	} else {
		/*
		 * If the syscall was async, wait for I/O to complete
		 */
#ifndef __linux__
		if (sy->sy_flags & SY_ASYNC) {
			for (i = 0; i < nents; i++) {
				aio_wait(s->aioid[i]);
			}
		}
#endif

		/*
		 * Check the syscall how-much-data-written return.  Look
		 * for this in either the return value or the 'iosw'
		 * structure.
		 */

		if (sy->sy_flags & SY_IOSW) {
#ifdef CRAY
			for (i = 0; i < nents; i++) {
				if (s->aioid == NULL)
					break;	/* >>> error condition? */
				aiop = aio_slot(s->aioid[i]);
				iosw = &aiop->iosw;
				if (iosw->sw_error != 0) {
					doio_fprintf(stderr,
						     "%s() iosw error set: %s\n%s\n%s\n",
						     sy->sy_name,
						     strerror(iosw->sw_error),
						     fmt_ioreq(req, sy, fd),
						     (*sy->sy_format) (req, sy,
								       fd,
								       addr));
					doio_upanic(U_IOSW);
					rval = -1;
				} else if (iosw->sw_count != nbytes * nstrides) {
					doio_fprintf(stderr,
						     "Bad iosw from %s() #%d\nExpected (%d,%d,%d), got (%d,%d,%d)\n%s\n%s\n",
						     sy->sy_name, i,
						     1, 0, nbytes * nstrides,
						     iosw->sw_flag,
						     iosw->sw_error,
						     iosw->sw_count,
						     fmt_ioreq(req, sy, fd),
						     (*sy->sy_format) (req, sy,
								       fd,
								       addr));
					doio_upanic(U_IOSW);
					rval = -1;
				}

				aio_unregister(s->aioid[i]);
			}
#endif /* CRAY */
#ifdef sgi
			for (i = 0; s->aioid[i] != -1; i++) {
				if (s->aioid == NULL) {
					doio_fprintf(stderr,
						     "aioid == NULL!\n");
					break;
				}
				aiop = aio_slot(s->aioid[i]);

				/*
				 * make sure the io completed without error
				 */
				if (aiop->aio_errno != 0) {
					doio_fprintf(stderr,
						     "%s() aio error set: %s (%d)\n%s\n%s\n",
						     sy->sy_name,
						     strerror(aiop->aio_errno),
						     aiop->aio_errno,
						     fmt_ioreq(req, sy, fd),
						     (*sy->sy_format) (req, sy,
								       fd,
								       addr));
					doio_upanic(U_IOSW);
					rval = -1;
				} else if (aiop->aio_ret != nbytes) {
					doio_fprintf(stderr,
						     "Bad aio return from %s() #%d\nExpected (%d,%d), got (%d,%d)\n%s\n%s\n",
						     sy->sy_name, i,
						     0, nbytes,
						     aiop->aio_errno,
						     aiop->aio_ret,
						     fmt_ioreq(req, sy, fd),
						     (*sy->sy_format) (req, sy,
								       fd,
								       addr));
					aio_unregister(s->aioid[i]);
					doio_upanic(U_IOSW);
					return -1;
				} else {
					aio_unregister(s->aioid[i]);
					rval = 0;
				}
			}
#endif /* sgi */
		} else {

			if (s->rval != mem_needed) {
				doio_fprintf(stderr,
					     "%s() request returned wrong # of bytes - expected %d, got %d\n%s\n%s\n",
					     sy->sy_name, nbytes, s->rval,
					     fmt_ioreq(req, sy, fd),
					     (*sy->sy_format) (req, sy, fd,
							       addr));
				rval = -1;
				doio_upanic(U_RVAL);
			}
		}
	}

	/*
	 * Verify that the data was written correctly - check_file() returns
	 * a non-null pointer which contains an error message if there are
	 * problems.
	 */

	if (rval == 0 && sy->sy_flags & SY_WRITE && v_opt) {
		msg = check_file(file, offset, nbytes * nstrides * nents,
				 Pattern, Pattern_Length, 0,
				 oflags & O_PARALLEL);
		if (msg != NULL) {
			doio_fprintf(stderr, "%s\n%s\n%s\n",
				     msg,
				     fmt_ioreq(req, sy, fd),
				     (*sy->sy_format) (req, sy, fd, addr));
			doio_upanic(U_CORRUPTION);
			exit(E_COMPARE);
		}
	}

	/*
	 * General cleanup ...
	 *
	 * Write extent information to the write-log, so that doio_check can do
	 * corruption detection.  Note that w_done is set to 1, indicating that
	 * the write has been verified as complete.  We don't need to write the
	 * filename on the second logging.
	 */

	if (w_opt && logged_write) {
		wrec.w_done = 1;
		wlog_record_write(&Wlog, &wrec, woffset);
	}

	/*
	 * Unlock file region if necessary
	 */

	if (got_lock) {
		if (lock_file_region(file, fd, F_UNLCK,
				     min_byte, (max_byte - min_byte + 1)) < 0) {
			alloc_mem(-1);
			exit(E_INTERNAL);
		}
	}

	if (s->aioid != NULL)
		free(s->aioid);
	free(s);
	return (rval == -1) ? -1 : 0;
}

/*
 * fcntl-based requests
 *   - F_RESVSP
 *   - F_UNRESVSP
 *   - F_FSYNC
 */
#ifdef sgi
/*
 * do_fcntl - carry out one fcntl-based request (RESVSP, UNRESVSP and,
 * where F_FSYNC exists, DFFSYNC).
 *
 * Builds a struct flock covering [r_offset, r_offset + r_nbytes) from the
 * request, optionally write-locks that region first (k_opt), issues the
 * fcntl() and releases the lock again.
 *
 * Returns 0 on success, -1 if the request type is not one of the above,
 * the file cannot be opened, or the fcntl() fails.  Exits the process if
 * the region cannot be locked or unlocked.
 */
int do_fcntl(struct io_req *req)
{
	int fd, oflags, offset, nbytes;
	int rval, op;
	int got_lock;
	int min_byte, max_byte;
	char *file, *msg;
	struct flock flk;

	/*
	 * Map the request type to its fcntl command up front, so that an
	 * unexpected type is rejected before a lock is taken and 'op' and
	 * 'msg' are never used uninitialized.
	 */
	switch (req->r_type) {
	case RESVSP:
		op = F_RESVSP;
		msg = "f_resvsp";
		break;
	case UNRESVSP:
		op = F_UNRESVSP;
		msg = "f_unresvsp";
		break;
#ifdef F_FSYNC
	case DFFSYNC:
		op = F_FSYNC;
		msg = "f_fsync";
		break;
#endif
	default:
		doio_fprintf(stderr, "do_fcntl: unknown r_type %d.\n",
			     req->r_type);
		return -1;
	}

	/*
	 * Initialize common fields - assumes r_oflags, r_file, r_offset, and
	 * r_nbytes are at the same offset in the read_req and reada_req
	 * structures.
	 */
	file = req->r_data.io.r_file;
	oflags = req->r_data.io.r_oflags;
	offset = req->r_data.io.r_offset;
	nbytes = req->r_data.io.r_nbytes;

	flk.l_type = 0;
	flk.l_whence = SEEK_SET;
	flk.l_start = offset;
	flk.l_len = nbytes;

	/*
	 * Get an open file descriptor
	 */

	if ((fd = alloc_fd(file, oflags)) == -1)
		return -1;

	rval = 0;
	got_lock = 0;

	/*
	 * Lock data if the locking option is set
	 */
	if (k_opt) {
		min_byte = offset;
		max_byte = offset + nbytes;

		if (lock_file_region(file, fd, F_WRLCK,
				     min_byte, (nbytes + 1)) < 0) {
			doio_fprintf(stderr, "file lock failed:\n");
			doio_fprintf(stderr,
				     "          buffer(req, %d, 0, 0x%x, 0x%x)\n",
				     offset, min_byte, max_byte);
			alloc_mem(-1);
			exit(E_INTERNAL);
		}

		got_lock = 1;
	}

	rval = fcntl(fd, op, &flk);

	if (rval == -1) {
		doio_fprintf(stderr,
			     "fcntl %s request failed: %s (%d)\n\tfcntl(%d, %s %d, {%d %lld ==> %lld}\n",
			     msg, SYSERR, errno,
			     fd, msg, op, flk.l_whence,
			     (long long)flk.l_start, (long long)flk.l_len);

		doio_upanic(U_RVAL);
		rval = -1;
	}

	/*
	 * Unlock file region if necessary
	 */

	if (got_lock) {
		if (lock_file_region(file, fd, F_UNLCK,
				     min_byte, (max_byte - min_byte + 1)) < 0) {
			alloc_mem(-1);
			exit(E_INTERNAL);
		}
	}

	return (rval == -1) ? -1 : 0;
}
#endif /* sgi */

/*
 *  fsync(2) and fdatasync(2)
 */
#ifndef CRAY
/*
 * do_sync - carry out an FSYNC2 or FDATASYNC request.
 *
 * Opens (or reuses) a descriptor for the request's file and calls
 * fsync() or fdatasync() on it according to req->r_type.
 *
 * Returns 0 on success; -1 if the descriptor cannot be obtained, the
 * sync call fails, or the request type is neither of the two.
 */
int do_sync(struct io_req *req)
{
	int fd, status;

	fd = alloc_fd(req->r_data.io.r_file, req->r_data.io.r_oflags);
	if (fd == -1)
		return -1;

	if (req->r_type == FSYNC2)
		status = fsync(fd);
	else if (req->r_type == FDATASYNC)
		status = fdatasync(fd);
	else
		status = -1;

	return (status == -1) ? -1 : 0;
}
#endif /* !CRAY */

/*
 * doio_pat_fill - fill mem_needed bytes at addr with the repeating
 * pattern, by way of pattern_fill().
 *
 * The shift argument is accepted but not forwarded; pattern_fill() is
 * always called with a shift of 0.  Returns pattern_fill()'s result.
 */
int
doio_pat_fill(char *addr, int mem_needed, char *Pattern, int Pattern_Length,
	      int shift)
{
	int filled;

	filled = pattern_fill(addr, mem_needed, Pattern, Pattern_Length, 0);
	return filled;
}

/*
 * doio_pat_check - verify that buf holds the repeating pattern and, if it
 * does not, describe the first corrupt region.
 *
 *   buf, length              data to verify
 *   offset                   file offset buf was read from (used only in
 *                            the message)
 *   pattern, pattern_length  expected repeating pattern
 *   patshift                 position in the pattern that buf[0] should
 *                            correspond to
 *
 * Returns NULL when pattern_check() accepts the buffer.  Otherwise
 * returns a message in a static buffer (overwritten by the next call)
 * giving the offset of the first mismatching byte and up to 32 expected
 * and actual bytes from there on, with unprintable characters shown
 * as '.'.
 */
char *doio_pat_check(char *buf, int offset, int length, char *pattern,
		     int pattern_length, int patshift)
{
	static char errbuf[4096];
	int nb, i, pattern_index;
	char *cp, *bufend, *ep;
	char actual[33], expected[33];

	if (pattern_check(buf, length, pattern, pattern_length, patshift) == 0)
		return NULL;

	ep = errbuf;
	ep +=
	    sprintf(ep,
		    "Corrupt regions follow - unprintable chars are represented as '.'\n");
	ep +=
	    sprintf(ep,
		    "-----------------------------------------------------------------\n");

	pattern_index = patshift % pattern_length;
	bufend = buf + length;

	/* Advance to the first byte that departs from the pattern. */
	for (cp = buf; cp < bufend; cp++) {
		if (*cp != pattern[pattern_index])
			break;

		pattern_index++;
		if (pattern_index == pattern_length)
			pattern_index = 0;
	}

	/*
	 * pattern_check() reported a problem but no byte differs: only the
	 * two header lines are returned.
	 */
	if (cp == bufend)
		return errbuf;

	/* Show at most sizeof(expected) - 1 bytes of context. */
	nb = bufend - cp;
	if ((unsigned int)nb > sizeof(expected) - 1) {
		nb = sizeof(expected) - 1;
	}

	ep +=
	    sprintf(ep,
		    "corrupt bytes starting at file offset %d\n",
		    offset + (int)(cp - buf));

	/*
	 * Fill in the expected and actual patterns
	 */
	memset(expected, 0x00, sizeof(expected));
	memset(actual, 0x00, sizeof(actual));

	for (i = 0; i < nb; i++) {
		/*
		 * isprint() is only defined for values representable as
		 * unsigned char (or EOF); plain char may be negative.
		 */
		expected[i] = pattern[(pattern_index + i) % pattern_length];
		if (!isprint((unsigned char)expected[i])) {
			expected[i] = '.';
		}

		actual[i] = cp[i];
		if (!isprint((unsigned char)actual[i])) {
			actual[i] = '.';
		}
	}

	ep +=
	    sprintf(ep,
		    "    1st %2d expected bytes:  %s\n",
		    nb, expected);
	ep +=
	    sprintf(ep,
		    "    1st %2d actual bytes:    %s\n",
		    nb, actual);
	fflush(stderr);
	return errbuf;
}

/*
 * Check the contents of a file beginning at offset, for length bytes.  It
 * is assumed that there is a string of pattern bytes in this area of the
 * file.  Use normal buffered reads to do the verification.
 *
 * If there is a data mismatch, write a detailed message into a static buffer
 * suitable for the caller to print.  Otherwise return NULL.
 *
 * The fsa flag is set to non-zero if the buffer should be read back through
 * the FSA (unicos/mk).  This implies the file will be opened
 * O_PARALLEL|O_RAW|O_WELLFORMED to do the validation.  We must do this because
 * FSA will not allow the file to be opened for buffered io if it was
 * previously opened for O_PARALLEL io.
 */

char *check_file(char *file, int offset, int length, char *pattern,
		 int pattern_length, int patshift, int fsa)
{
	/*
	 * Every message is built with snprintf() bounded by sizeof(errbuf):
	 * the file name and the Data_Check text are caller supplied, so an
	 * unbounded sprintf()/strcpy() could run past the buffer.
	 */
	static char errbuf[4096];
	int fd, nb, flags;
	char *buf, *em;
#ifdef sgi
	struct fd_cache *fdc;
#endif

	/* The file is read back into the global i/o buffer. */
	buf = Memptr;

	if (V_opt) {
		flags = Validation_Flags | O_RDONLY;
	} else {
		flags = O_RDONLY;
		if (fsa) {
#ifdef CRAY
			flags |= O_PARALLEL | O_RAW | O_WELLFORMED;
#endif
		}
	}

	if ((fd = alloc_fd(file, flags)) == -1) {
		snprintf(errbuf, sizeof(errbuf),
			 "Could not open file %s with flags %#o (%s) for data comparison:  %s (%d)\n",
			 file, flags, format_oflags(flags), SYSERR, errno);
		return errbuf;
	}

	if (lseek(fd, offset, SEEK_SET) == -1) {
		snprintf(errbuf, sizeof(errbuf),
			 "Could not lseek to offset %d in %s for verification:  %s (%d)\n",
			 offset, file, SYSERR, errno);
		return errbuf;
	}
#ifdef sgi
	/* Irix: Guarantee a properly aligned address on Direct I/O */
	fdc = alloc_fdcache(file, flags);
	if ((flags & O_DIRECT) && ((long)buf % fdc->c_memalign != 0)) {
		buf += fdc->c_memalign - ((long)buf % fdc->c_memalign);
	}
#endif

	if ((nb = read(fd, buf, length)) == -1) {
#ifdef sgi
		/* %lx takes an unsigned long, so convert the pointer. */
		snprintf(errbuf, sizeof(errbuf),
			 "Could not read %d bytes from %s for verification:  %s (%d)\n\tread(%d, 0x%lx, %d)\n\tbuf %% alignment(%d) = %ld\n",
			 length, file, SYSERR, errno,
			 fd, (unsigned long)buf, length,
			 fdc->c_memalign, (long)buf % fdc->c_memalign);
#else
		snprintf(errbuf, sizeof(errbuf),
			 "Could not read %d bytes from %s for verification:  %s (%d)\n",
			 length, file, SYSERR, errno);

#endif
		return errbuf;
	}

	if (nb != length) {
		snprintf(errbuf, sizeof(errbuf),
			 "Read wrong # bytes from %s.  Expected %d, got %d\n",
			 file, length, nb);
		return errbuf;
	}

	em = (*Data_Check) (buf, offset, length, pattern, pattern_length,
			    patshift);
	if (em != NULL) {
		/* Header lines followed by the checker's own description. */
		snprintf(errbuf, sizeof(errbuf),
			 "*** DATA COMPARISON ERROR ***\n"
			 "check_file(%s, %d, %d, %s, %d, %d) failed\n\n"
			 "Comparison fd is %d, with open flags %#o\n"
			 "%s",
			 file, offset, length, pattern, pattern_length,
			 patshift,
			 fd, flags,
			 em);
		return (errbuf);
	}
	return NULL;
}

/*
 * Function to single-thread stdio output.
 */

/*
 * doio_fprintf - printf-style output to stream, serialized between
 * processes.
 *
 * Takes a blocking fcntl() write lock on the stream's descriptor, prints
 * a banner (program name, tag, pid, time of day) and a separator line
 * followed by the formatted message, flushes, and drops the lock.
 *
 * The caller's errno is preserved, so error paths can log first and
 * record errno afterwards.
 *
 * Returns the sum of the fprintf() results for the banner and separator
 * lines; the length of the formatted message itself is not included.
 */
int doio_fprintf(FILE * stream, char *format, ...)
{
	static int pid = -1;
	char *date;
	int rval;
	int saved_errno = errno;
	struct flock flk;
	va_list arglist;
	struct timeval ts;

	gettimeofday(&ts, NULL);
	date = hms(ts.tv_sec);

	if (pid == -1) {
		pid = getpid();
	}

	/* Whole-file lock; clear every field before handing it to fcntl(). */
	memset(&flk, 0, sizeof(flk));
	flk.l_whence = flk.l_start = flk.l_len = 0;
	flk.l_type = F_WRLCK;
	fcntl(fileno(stream), F_SETLKW, &flk);

	rval = fprintf(stream, "\n%s%s (%5d) %s\n", Prog, TagName, pid, date);
	rval += fprintf(stream, "---------------------\n");

	va_start(arglist, format);
	vfprintf(stream, format, arglist);
	va_end(arglist);

	fflush(stream);

	flk.l_type = F_UNLCK;
	fcntl(fileno(stream), F_SETLKW, &flk);

	errno = saved_errno;
	return rval;
}

/*
 * Simple function for allocating core memory.  Uses Memsize and Memptr to
 * keep track of the current amount allocated.
 */
#ifndef CRAY
/*
 * alloc_mem (non-CRAY) - make sure an i/o buffer of at least nbytes bytes
 * is available and publish it through Memptr/Memsize.
 *
 * Each call uses the next entry of Memalloc[] in turn (round-robin).
 * Depending on the entry's memtype the buffer is malloc'ed (MEM_DATA),
 * a mapping of the named file (MEM_MMAP) or a System V shared memory
 * segment (MEM_SHMEM).  An existing buffer is reused if it is large
 * enough and released and re-created otherwise.
 *
 * nbytes == -1 is special: every buffer held in Memalloc[] is released.
 *
 * Returns 0 on success, -1 on failure, or SKIP_REQ when a fixed-size
 * shared memory segment (nblks set) is too small for the request.
 */
int alloc_mem(int nbytes)
{
	char *cp;
	void *addr;
	int me = 0, flags, key, shmid;
	static int mturn = 0;	/* which memory type to use */
	struct memalloc *M;
	char filename[255];
#ifdef __linux__
	struct shmid_ds shm_ds;
#endif

#ifdef __linux__
	memset(&shm_ds, 0x00, sizeof(struct shmid_ds));
#endif

	/* nbytes = -1 means "free all allocated memory" */
	if (nbytes == -1) {

		for (me = 0; me < Nmemalloc; me++) {
			if (Memalloc[me].space == NULL)
				continue;

			switch (Memalloc[me].memtype) {
			case MEM_DATA:
#ifdef sgi
				if (Memalloc[me].flags & MEMF_MPIN)
					munpin(Memalloc[me].space,
					       Memalloc[me].size);
#endif
				free(Memalloc[me].space);
				Memalloc[me].space = NULL;
				Memptr = NULL;
				Memsize = 0;
				break;
			case MEM_SHMEM:
#ifdef sgi
				if (Memalloc[me].flags & MEMF_MPIN)
					munpin(Memalloc[me].space,
					       Memalloc[me].size);
#endif
				shmdt(Memalloc[me].space);
				Memalloc[me].space = NULL;
#ifdef sgi
				shmctl(Memalloc[me].fd, IPC_RMID);
#else
				shmctl(Memalloc[me].fd, IPC_RMID, &shm_ds);
#endif
				break;
			case MEM_MMAP:
#ifdef sgi
				if (Memalloc[me].flags & MEMF_MPIN)
					munpin(Memalloc[me].space,
					       Memalloc[me].size);
#endif
				munmap(Memalloc[me].space, Memalloc[me].size);
				close(Memalloc[me].fd);
				if (Memalloc[me].flags & MEMF_FILE) {
					unlink(Memalloc[me].name);
				}
				Memalloc[me].space = NULL;
				break;
			default:
				doio_fprintf(stderr,
					     "alloc_mem: HELP! Unknown memory space type %d index %d\n",
					     Memalloc[me].memtype, me);
				break;
			}
		}
		return 0;
	}

	/*
	 * Select a memory area (currently round-robin)
	 */

	if (mturn >= Nmemalloc)
		mturn = 0;

	M = &Memalloc[mturn];

	switch (M->memtype) {
	case MEM_DATA:
		if (nbytes > M->size) {
			if (M->space != NULL) {
#ifdef sgi
				if (M->flags & MEMF_MPIN)
					munpin(M->space, M->size);
#endif
				free(M->space);
			}
			M->space = NULL;
			M->size = 0;
		}

		if (M->space == NULL) {
			if ((cp = malloc(nbytes)) == NULL) {
				doio_fprintf(stderr,
					     "malloc(%d) failed:  %s (%d)\n",
					     nbytes, SYSERR, errno);
				return -1;
			}
#ifdef sgi
			if (M->flags & MEMF_MPIN) {
				if (mpin(cp, nbytes) == -1) {
					doio_fprintf(stderr,
						     "mpin(0x%lx, %d) failed:  %s (%d)\n",
						     (unsigned long)cp, nbytes,
						     SYSERR, errno);
				}
			}
#endif
			M->space = (void *)cp;
			M->size = nbytes;
		}
		break;

	case MEM_MMAP:
		if (nbytes > M->size) {
			if (M->space != NULL) {
#ifdef sgi
				if (M->flags & MEMF_MPIN)
					munpin(M->space, M->size);
#endif
				munmap(M->space, M->size);
				close(M->fd);
				if (M->flags & MEMF_FILE)
					unlink(M->name);
			}
			M->space = NULL;
			M->size = 0;
		}

		if (M->space == NULL) {
			/*
			 * A '%' in the name makes it a template that is
			 * expanded with the pid.  Bound the expansion to the
			 * local buffer.
			 */
			if (strchr(M->name, '%')) {
				snprintf(filename, sizeof(filename), M->name,
					 getpid());
				M->name = strdup(filename);
			}

			if ((M->fd =
			     open(M->name, O_CREAT | O_RDWR, 0666)) == -1) {
				doio_fprintf(stderr,
					     "alloc_mmap: error %d (%s) opening '%s'\n",
					     errno, SYSERR, M->name);
				return (-1);
			}

			addr = NULL;
			flags = 0;
			M->size = nbytes * 4;

			/* bias addr if MEMF_ADDR | MEMF_FIXADDR */
			/* >>> how to pick a memory address? */

			/* bias flags on MEMF_PRIVATE etc */
			if (M->flags & MEMF_PRIVATE)
				flags |= MAP_PRIVATE;
#ifdef sgi
			if (M->flags & MEMF_LOCAL)
				flags |= MAP_LOCAL;
			if (M->flags & MEMF_AUTORESRV)
				flags |= MAP_AUTORESRV;
			if (M->flags & MEMF_AUTOGROW)
				flags |= MAP_AUTOGROW;
#endif
			if (M->flags & MEMF_SHARED)
				flags |= MAP_SHARED;

/*printf("alloc_mem, about to mmap, fd=%d, name=(%s)\n", M->fd, M->name);*/
			if ((M->space = mmap(addr, M->size,
					     PROT_READ | PROT_WRITE,
					     flags, M->fd, 0))
			    == MAP_FAILED) {
				doio_fprintf(stderr,
					     "alloc_mem: mmap error. errno %d (%s)\n\tmmap(addr 0x%lx, size %d, read|write 0x%x, mmap flags 0x%x [%#o], fd %d, 0)\n\tfile %s\n",
					     errno, SYSERR,
					     (unsigned long)addr, M->size,
					     PROT_READ | PROT_WRITE, flags,
					     M->flags, M->fd, M->name);
				/*
				 * The format always has five %s; supply
				 * empty strings for the sgi-only flags on
				 * other systems.
				 */
				doio_fprintf(stderr, "\t%s%s%s%s%s",
					     (flags & MAP_PRIVATE) ? "private "
					     : "",
#ifdef sgi
					     (flags & MAP_LOCAL) ? "local " :
					     "",
					     (flags & MAP_AUTORESRV) ?
					     "autoresrv " : "",
					     (flags & MAP_AUTOGROW) ?
					     "autogrow " : "",
#else
					     "", "", "",
#endif
					     (flags & MAP_SHARED) ? "shared" :
					     "");
				/*
				 * Do not leave MAP_FAILED behind: a later
				 * call would take it for a valid buffer.
				 */
				close(M->fd);
				M->space = NULL;
				M->size = 0;
				return (-1);
			}
		}
		break;

	case MEM_SHMEM:
		if (nbytes > M->size) {
			if (M->space != NULL) {
#ifdef sgi
				if (M->flags & MEMF_MPIN)
					munpin(M->space, M->size);
#endif
				shmdt(M->space);
#ifdef sgi
				shmctl(M->fd, IPC_RMID);
#else
				shmctl(M->fd, IPC_RMID, &shm_ds);
#endif
			}
			M->space = NULL;
			M->size = 0;
		}

		if (M->space == NULL) {
			if (!strcmp(M->name, "private")) {
				key = IPC_PRIVATE;
			} else {
				sscanf(M->name, "%i", &key);
			}

			M->size = M->nblks ? M->nblks * 512 : nbytes;

			if (nbytes > M->size) {
#ifdef DEBUG
				doio_fprintf(stderr,
					     "MEM_SHMEM: nblks(%d) too small:  nbytes=%d  Msize=%d, skipping this req.\n",
					     M->nblks, nbytes, M->size);
#endif
				return SKIP_REQ;
			}

			shmid = shmget(key, M->size, IPC_CREAT | 0666);
			if (shmid == -1) {
				doio_fprintf(stderr,
					     "shmget(0x%x, %d, CREAT) failed: %s (%d)\n",
					     key, M->size, SYSERR, errno);
				return (-1);
			}
			M->fd = shmid;
			M->space = shmat(shmid, NULL, SHM_RND);
			if (M->space == (void *)-1) {
				doio_fprintf(stderr,
					     "shmat(0x%x, NULL, SHM_RND) failed: %s (%d)\n",
					     shmid, SYSERR, errno);
				/* Same as for mmap: drop the failure sentinel. */
				M->space = NULL;
				M->size = 0;
				return (-1);
			}
#ifdef sgi
			if (M->flags & MEMF_MPIN) {
				if (mpin(M->space, M->size) == -1) {
					doio_fprintf(stderr,
						     "mpin(0x%lx, %d) failed:  %s (%d)\n",
						     (unsigned long)M->space,
						     M->size, SYSERR,
						     errno);
				}
			}
#endif
		}
		break;

	default:
		/* Report the entry that was actually selected. */
		doio_fprintf(stderr,
			     "alloc_mem: HELP! Unknown memory space type %d index %d\n",
			     M->memtype, mturn);
		break;
	}

	Memptr = M->space;
	Memsize = M->size;

	mturn++;
	return 0;
}
#else /* CRAY */
/*
 * CRAY version of alloc_mem(): make sure the I/O buffer described by the
 * globals Memptr/Memsize is at least nbytes long.
 *
 * nbytes - required size in bytes, or -1 to release the buffer.
 *
 * Returns 0 on success and -1 if the allocation fails (in which case
 * Memptr/Memsize are reset so they never describe freed memory).
 */
int alloc_mem(int nbytes)
{
	char *cp;
	int ip;
	static char *malloc_space;

	/*
	 * The "unicos" version of this did some stuff with sbrk;
	 * this caused problems with async I/O on irix, and now appears
	 * to be causing problems with FSA I/O on unicos/mk.
	 */
#ifdef NOTDEF
	if (nbytes > Memsize) {
		if ((cp = (char *)sbrk(nbytes - Memsize)) == (char *)-1) {
			doio_fprintf(stderr, "sbrk(%d) failed:  %s (%d)\n",
				     nbytes - Memsize, SYSERR, errno);
			return -1;
		}

		if (Memsize == 0)
			Memptr = cp;
		Memsize += nbytes - Memsize;
	}
#else

	/* nbytes = -1 means "free all allocated memory" */
	if (nbytes == -1) {
		free(malloc_space);
		/* forget the block so a second alloc_mem(-1) cannot double free */
		malloc_space = NULL;
		Memptr = NULL;
		Memsize = 0;
		return 0;
	}

	if (nbytes > Memsize) {
		/* malloc_space is NULL whenever nothing is allocated */
		free(malloc_space);
		malloc_space = NULL;

		if ((cp = malloc_space = malloc(nbytes)) == NULL) {
			doio_fprintf(stderr, "malloc(%d) failed:  %s (%d)\n",
				     nbytes, SYSERR, errno);
			/* the old buffer is gone; do not leave stale globals */
			Memptr = NULL;
			Memsize = 0;
			return -1;
		}
#ifdef _CRAYT3E
		/* T3E requires memory to be aligned on 0x40 word boundaries */
		ip = (int)cp;
		/* parenthesised: != binds tighter than & */
		if ((ip & 0x3F) != 0) {
			doio_fprintf(stderr,
				     "malloc(%d) = 0x%x(0x%x) not aligned by 0x%x\n",
				     nbytes, cp, ip, ip & 0x3f);

			free(cp);
			if ((cp = malloc_space = malloc(nbytes + 0x40)) == NULL) {
				doio_fprintf(stderr,
					     "malloc(%d) failed:  %s (%d)\n",
					     nbytes, SYSERR, errno);
				Memptr = NULL;
				Memsize = 0;
				return -1;
			}
			/*
			 * malloc_space keeps the address to free(); cp is
			 * moved forward to the next 0x40 boundary.
			 */
			ip = (int)cp;
			cp += (0x40 - (ip & 0x3F));
		}
#endif /* _CRAYT3E */
		Memptr = cp;
		Memsize = nbytes;
	}
#endif /* NOTDEF */
	return 0;
}
#endif /* CRAY */

/*
 * Simple function for allocating sds space.  Uses Sdssize and Sdsptr to
 * keep track of location and size of currently allocated chunk.
 */

#ifdef _CRAY1

/*
 * _CRAY1 version: make sure at least nbytes of sds space is allocated.
 * Nothing is done when Sdssize already covers the request.  Otherwise
 * ssbreak() is asked for the missing amount, and Sdssize/Sdsptr are
 * updated from its result.
 *
 * Returns 0 on success, -1 if ssbreak() fails.
 */
int alloc_sds(int nbytes)
{
	int granted;

	/* current chunk is already big enough */
	if (nbytes <= Sdssize)
		return 0;

	granted = ssbreak(btoc(nbytes - Sdssize));
	if (granted == -1) {
		doio_fprintf(stderr, "ssbreak(%d) failed:  %s (%d)\n",
			     btoc(nbytes - Sdssize), SYSERR, errno);
		return -1;
	}

	Sdssize = ctob(granted);
	Sdsptr = 0;

	return 0;
}

#else

#ifdef CRAY

/*
 * CRAY (non-_CRAY1) version: sds allocation is not available here, so a
 * call is treated as an internal error.  The I/O buffer is released and
 * the process exits with E_INTERNAL; this function never returns.
 */
int alloc_sds(int nbytes)
{
	(void)nbytes;		/* the request size is irrelevant here */

	doio_fprintf(stderr,
		     "Internal Error - alloc_sds() called on a CRAY2 system\n");

	alloc_mem(-1);
	exit(E_INTERNAL);
}

#endif

#endif /* _CRAY1 */

/*
 * Function to maintain a file descriptor cache, so that doio does not have
 * to do so many open() and close() calls.  Descriptors are stored in the
 * cache by file name, and open flags.  Each entry also has a _rtc value
 * associated with it which is used in aging.  If doio cannot open a file
 * because it already has too many open (ie. system limit hit) it will close
 * the one in the cache that has the oldest _rtc value.
 *
 * If alloc_fd() is called with a file of NULL, it will close all descriptors
 * in the cache, and free the memory in the cache.
 */

/*
 * Convenience wrapper around alloc_fdcache(): hands back just the
 * descriptor of the cache entry for (file, oflags), or -1 when
 * alloc_fdcache() returns no entry.
 */
int alloc_fd(char *file, int oflags)
{
	struct fd_cache *alloc_fdcache(char *file, int oflags);
	struct fd_cache *entry = alloc_fdcache(file, oflags);

	return (entry == NULL) ? -1 : entry->c_fd;
}

/*
 * Look up, or create, the fd cache entry for (file, oflags).
 *
 * file   - path to open; NULL means "close every cached descriptor and
 *          free the cache".
 * oflags - open(2) flags; together with the file name they form the
 *          cache key.
 *
 * Returns the cache entry holding the open descriptor, or NULL after a
 * flush request (file == NULL).  If the file cannot be opened, or the
 * cache cannot be grown, the error is reported, the I/O buffer is
 * released and the process exits with E_SETUP.
 *
 * The cache is grown with realloc(), so entries handed out by earlier
 * calls may move when a new file is added.
 */
struct fd_cache *alloc_fdcache(char *file, int oflags)
{
	int fd;
	struct fd_cache *free_slot, *oldest_slot, *cp;
	struct fd_cache *grown;
	static int cache_size = 0;
	static struct fd_cache *cache = NULL;
#ifdef sgi
	struct dioattr finfo;
#endif

	/*
	 * If file is NULL, it means to free up the fd cache.  Return right
	 * away even when the cache is already empty, so that a NULL file
	 * name never reaches strcmp()/open() below.
	 */

	if (file == NULL) {
		if (cache != NULL) {
			for (cp = cache; cp < &cache[cache_size]; cp++) {
				if (cp->c_fd != -1) {
					close(cp->c_fd);
				}
#ifndef CRAY
				if (cp->c_memaddr != NULL) {
					munmap(cp->c_memaddr, cp->c_memlen);
				}
#endif
			}

			free(cache);
			cache = NULL;
			cache_size = 0;
		}
		return NULL;
	}

	free_slot = NULL;
	oldest_slot = NULL;

	/*
	 * Look for a fd in the cache.  If one is found, return it directly.
	 * Otherwise, when this loop exits, oldest_slot will point to the
	 * oldest fd slot in the cache, and free_slot will point to an
	 * unoccupied slot if there are any.
	 */

	for (cp = cache; cp != NULL && cp < &cache[cache_size]; cp++) {
		if (cp->c_fd != -1 &&
		    cp->c_oflags == oflags && strcmp(cp->c_file, file) == 0) {
#ifdef CRAY
			cp->c_rtc = _rtc();
#else
			cp->c_rtc = Reqno;
#endif
			return cp;
		}

		if (cp->c_fd == -1) {
			if (free_slot == NULL) {
				free_slot = cp;
			}
		} else {
			if (oldest_slot == NULL ||
			    cp->c_rtc < oldest_slot->c_rtc) {
				oldest_slot = cp;
			}
		}
	}

	/*
	 * No matching file/oflags pair was found in the cache.  Attempt to
	 * open a new fd.
	 */

	if ((fd = open(file, oflags, 0666)) < 0) {
		/*
		 * Only EMFILE can be cured by evicting a cached descriptor,
		 * and only if the cache actually holds one.
		 */
		if (errno != EMFILE || oldest_slot == NULL) {
			doio_fprintf(stderr,
				     "Could not open file %s with flags %#o (%s): %s (%d)\n",
				     file, oflags, format_oflags(oflags),
				     SYSERR, errno);
			alloc_mem(-1);
			exit(E_SETUP);
		}

		/*
		 * If we get here, we have as many open fd's as we can have.
		 * Close the oldest one in the cache (pointed to by
		 * oldest_slot), and attempt to re-open.
		 */

		close(oldest_slot->c_fd);
		oldest_slot->c_fd = -1;
#ifndef CRAY
		/*
		 * The slot is about to be reused and its c_memaddr reset, so
		 * release the mapping now instead of losing track of it.
		 */
		if (oldest_slot->c_memaddr != NULL) {
			munmap(oldest_slot->c_memaddr, oldest_slot->c_memlen);
			oldest_slot->c_memaddr = NULL;
			oldest_slot->c_memlen = 0;
		}
#endif
		free_slot = oldest_slot;

		if ((fd = open(file, oflags, 0666)) < 0) {
			doio_fprintf(stderr,
				     "Could not open file %s with flags %#o (%s):  %s (%d)\n",
				     file, oflags, format_oflags(oflags),
				     SYSERR, errno);
			alloc_mem(-1);
			exit(E_SETUP);
		}
	}

	/*
	 * If we get here, fd is our open descriptor.  If free_slot is NULL,
	 * we need to grow the cache, otherwise free_slot is the slot that
	 * should hold the fd info.
	 */

	if (free_slot == NULL) {
		grown = realloc(cache,
				sizeof(struct fd_cache) *
				(FD_ALLOC_INCR + cache_size));
		if (grown == NULL) {
			doio_fprintf(stderr,
				     "Could not realloc() space for fd cache\n");
			alloc_mem(-1);
			exit(E_SETUP);
		}

		cache = grown;
		cache_size += FD_ALLOC_INCR;

		/*
		 * Mark every new slot as unused.  c_memaddr must be cleared
		 * too: the flush path above munmap()s any slot whose
		 * c_memaddr is non-NULL, used or not.
		 */
		for (cp = &cache[cache_size - FD_ALLOC_INCR];
		     cp < &cache[cache_size]; cp++) {
			cp->c_fd = -1;
#ifndef CRAY
			cp->c_memaddr = NULL;
			cp->c_memlen = 0;
#endif
		}

		free_slot = &cache[cache_size - FD_ALLOC_INCR];
	}

	/*
	 * finally, fill in the cache slot info
	 */

	free_slot->c_fd = fd;
	free_slot->c_oflags = oflags;
	/* NOTE(review): unbounded copy - confirm c_file can hold any path used */
	strcpy(free_slot->c_file, file);
#ifdef CRAY
	free_slot->c_rtc = _rtc();
#else
	free_slot->c_rtc = Reqno;
#endif

#ifdef sgi
	/* direct I/O constraints; 1 means "no constraint" */
	if (oflags & O_DIRECT) {
		if (fcntl(fd, F_DIOINFO, &finfo) == -1) {
			finfo.d_mem = 1;
			finfo.d_miniosz = 1;
			finfo.d_maxiosz = 1;
		}
	} else {
		finfo.d_mem = 1;
		finfo.d_miniosz = 1;
		finfo.d_maxiosz = 1;
	}

	free_slot->c_memalign = finfo.d_mem;
	free_slot->c_miniosz = finfo.d_miniosz;
	free_slot->c_maxiosz = finfo.d_maxiosz;
#endif /* sgi */
#ifndef CRAY
	free_slot->c_memaddr = NULL;
	free_slot->c_memlen = 0;
#endif

	return free_slot;
}

/*
 *
 *			Signal Handling Section
 *
 *
 */

#ifdef sgi
/*
 * "caller-id" for signals: report through doio_fprintf() what the siginfo
 * structure says about the signal being handled.  When no siginfo is
 * supplied only the signal number is reported.
 */
void signal_info(int sig, siginfo_t * info, void *v)
{
	if (info == NULL) {
		doio_fprintf(stderr, "signal_info: sig %d\n", sig);
		return;
	}

	/* si_code values that get a dedicated report */
	if (info->si_code == SI_USER) {
		doio_fprintf(stderr,
			     "signal_info: si_signo %d si_errno %d si_code SI_USER pid %d uid %d\n",
			     info->si_signo, info->si_errno,
			     info->si_pid, info->si_uid);
		return;
	}

	if (info->si_code == SI_QUEUE) {
		doio_fprintf(stderr,
			     "signal_info  si_signo %d si_code = SI_QUEUE\n",
			     info->si_signo);
		return;
	}

	/* memory faults: include the faulting address and the mmap state */
	if (info->si_signo == SIGSEGV || info->si_signo == SIGBUS) {
		doio_fprintf(stderr,
			     "signal_info  si_signo %d si_errno %d si_code = %d  si_addr=%p  active_mmap_rw=%d havesigint=%d\n",
			     info->si_signo, info->si_errno,
			     info->si_code, info->si_addr,
			     active_mmap_rw, havesigint);
		return;
	}

	/* anything else */
	doio_fprintf(stderr,
		     "signal_info: si_signo %d si_errno %d unknown code %d\n",
		     info->si_signo, info->si_errno, info->si_code);
}

/*
 * Quiet termination handler: remembers that a signal arrived, releases
 * the I/O buffer and exits with status 0.  No signal_info() report is
 * produced, to stay silent on a "normal" kill.
 */
void cleanup_handler(int sig, siginfo_t * info, void *v)
{
	/* recorded in case there's a followup signal */
	havesigint = 1;

	alloc_mem(-1);
	exit(0);
}

/*
 * Fatal signal handler: reports the signal number and whatever
 * signal_info() can tell about it, releases the I/O buffer and exits
 * with status 1.
 */
void die_handler(int sig, siginfo_t * info, void *v)
{
	doio_fprintf(stderr, "terminating on signal %d\n", sig);
	signal_info(sig, info, v);

	alloc_mem(-1);
	exit(1);
}

/*
 * Handler that decides whether the signal is the expected follow-up of a
 * shutdown request (quiet exit through cleanup_handler) or a genuine
 * failure (die_handler).  Neither branch returns.
 */
void sigbus_handler(int sig, siginfo_t * info, void *v)
{
	/* While we are doing a memcpy to/from an mmapped region we can
	   get a SIGBUS for a variety of reasons--and not all of them
	   should be considered failures.

	   Under normal conditions if we get a SIGINT it means we've been
	   told to shutdown.  However, if we're currently doing the above-
	   mentioned memcopy then the kernel will follow that SIGINT with
	   a SIGBUS.  We can guess that we're in this situation by seeing
	   that the si_errno field in the siginfo structure has EINTR as
	   an errno.  (We might make the guess stronger by looking at the
	   si_addr field to see that it's not faulting off the end of the
	   mmapped region, but it seems that in such a case havesigint
	   would not have been set so maybe that doesn't make the guess
	   stronger.)
	 */

	/*
	 * signal_info() accepts a NULL siginfo, so do the same here: without
	 * siginfo the EINTR guess cannot be made and the signal is fatal.
	 */
	if (active_mmap_rw && havesigint &&
	    info != NULL && info->si_errno == EINTR) {
		cleanup_handler(sig, info, v);
	} else {
		die_handler(sig, info, v);
	}
}
#else

/*
 * Quiet termination handler: remembers that a signal arrived, releases
 * the I/O buffer and exits with status 0.
 */
void cleanup_handler(int sig)
{
	/* recorded in case there's a followup signal */
	havesigint = 1;

	alloc_mem(-1);
	exit(0);
}

/*
 * Fatal signal handler: reports the signal number, releases the I/O
 * buffer and exits with status 1.
 */
void die_handler(int sig)
{
	doio_fprintf(stderr, "terminating on signal %d\n", sig);

	alloc_mem(-1);
	exit(1);
}

#ifndef CRAY
/*
 * Variant of sigbus_handler() without siginfo.  A signal that arrives
 * while an mmap read/write is active and after havesigint was set is
 * taken to be the follow-up of a shutdown request and ends the process
 * quietly; anything else is fatal.  The guess is weaker than in the
 * 'ifdef sgi' version because si_errno cannot be inspected here.
 */
void sigbus_handler(int sig)
{
	if (!(active_mmap_rw && havesigint))
		die_handler(sig);
	else
		cleanup_handler(sig);
}
#endif /* !CRAY */
#endif /* sgi */

/*
 * Signal handler that intentionally does nothing.
 */
void noop_handler(int sig)
{
	(void)sig;
}

/*
 * SIGINT handler for the parent (original doio) process.  It forwards
 * the SIGINT to the doio children by calling kill() once for every entry
 * of Children[] that is not -1.
 */

void sigint_handler(int sig)
{
	int child;

	for (child = 0; child < Nchildren; child++) {
		/* entries set to -1 are skipped */
		if (Children[child] == -1)
			continue;

		kill(Children[child], SIGINT);
	}
}

/*
 * Signal handler used to inform a process when async io completes.  Referenced
 * in do_read() and do_write().  Note that the signal handler is not
 * re-registered.
 */

void aio_handler(int sig)
{
	unsigned int i;
	struct aio_info *aiop;

	for (i = 0; i < sizeof(Aio_Info) / sizeof(Aio_Info[0]); i++) {
		aiop = &Aio_Info[i];

		if (aiop->strategy == A_SIGNAL && aiop->sig == sig) {
			aiop->signalled++;

			if (aio_done(aiop)) {
				aiop->done++;
			}
		}
	}
}

/*
 * Dump info on all busy aio slots to stderr, followed by the number of
 * busy slots found.
 */
void dump_aio(void)
{
	unsigned int i, count;

	count = 0;
	for (i = 0; i < sizeof(Aio_Info) / sizeof(Aio_Info[0]); i++) {
		if (Aio_Info[i].busy) {
			count++;
			/* i and count are unsigned, hence %u rather than %d */
			fprintf(stderr,
				"Aio_Info[%03u] id=%d fd=%d signal=%d signaled=%d\n",
				i, Aio_Info[i].id,
				Aio_Info[i].fd,
				Aio_Info[i].sig, Aio_Info[i].signalled);
			fprintf(stderr, "\tstrategy=%s\n",
				format_strat(Aio_Info[i].strategy));
		}
	}
	fprintf(stderr, "%u active async i/os\n", count);
}

#ifdef sgi
/*
 * Completion handler invoked as a callback rather than as a signal.
 * 'val' is the value from sigev_value; its sival_int member is passed to
 * aio_slot() to find the Aio_Info[] entry.  Slots whose strategy is not
 * A_CALLBACK are left untouched.
 */
void cb_handler(sigval_t val)
{
	struct aio_info *slot = aio_slot(val.sival_int);

	if (slot->strategy != A_CALLBACK)
		return;

	slot->signalled++;

	if (aio_done(slot))
		slot->done++;
}
#endif

struct aio_info *aio_slot(int aio_id)
{
	unsigned int i;
	static int id = 1;
	struct aio_info *aiop;

	aiop = NULL;

	for (i = 0; i < sizeof(Aio_Info) / sizeof(Aio_Info[0]); i++) {
		if (aio_id == -1) {
			if (!Aio_Info[i].busy) {
				aiop = &Aio_Info[i];
				aiop->busy = 1;
				aiop->id = id++;
				break;
			}
		} else {
			if (Aio_Info[i].busy && Aio_Info[i].id == aio_id) {
				aiop = &Aio_Info[i];
				break;
			}
		}
	}

	if (aiop == NULL) {
		doio_fprintf(stderr, "aio_slot(%d) not found.  Request %d\n",
			     aio_id, Reqno);
		dump_aio();
		alloc_mem(-1);
		exit(E_INTERNAL);
	}

	return aiop;
}

int aio_register(int fd, int strategy, int sig)
{
	struct aio_info *aiop;
	struct sigaction sa;

	aiop = aio_slot(-1);

	aiop->fd = fd;
	aiop->strategy = strategy;
	aiop->done = 0;
#ifdef CRAY
	memset((char *)&aiop->iosw, 0x00, sizeof(aiop->iosw));
#endif

	if (strategy == A_SIGNAL) {
		aiop->sig = sig;
		aiop->signalled = 0;

		sa.sa_handler = aio_handler;
		sa.sa_flags = 0;
		sigemptyset(&sa.sa_mask);

		sigaction(sig, &sa, &aiop->osa);
	} else {
		aiop->sig = -1;
		aiop->signalled = 0;
	}

	return aiop->id;
}

/*
 * Release the aio slot registered under aio_id.  For an A_SIGNAL slot the
 * signal disposition saved in the slot is put back first.
 * Always returns 0.
 */
int aio_unregister(int aio_id)
{
	struct aio_info *slot = aio_slot(aio_id);

	if (slot->strategy == A_SIGNAL)
		sigaction(slot->sig, &slot->osa, NULL);

	slot->busy = 0;

	return 0;
}

#ifndef __linux__
/*
 * Wait for the async request registered under aio_id, using the method
 * selected by the slot's strategy.
 *
 * Always returns 0.  A failure of recall()/recalla()/recalls() or
 * aio_suspend() is reported and the process exits with E_SETUP.  A
 * strategy that has no case below falls through the switch without
 * waiting.
 */
int aio_wait(int aio_id)
{
#ifdef RECALL_SIZEOF
	long mask[RECALL_SIZEOF];
#endif
	sigset_t signalset;
	struct aio_info *aiop;
#ifdef CRAY
	struct iosw *ioswlist[1];
#endif
#ifdef sgi
	const aiocb_t *aioary[1];
#endif
	int r, cnt;

	/* aio_slot() exits rather than return NULL for an unknown id */
	aiop = aio_slot(aio_id);
/*printf("%d aiop B =%p\n", getpid(), aiop);*/

	switch (aiop->strategy) {
	case A_POLL:
		/* busy-wait until aio_done() reports completion */
		while (!aio_done(aiop)) ;
		break;

	case A_SIGNAL:
		/*
		 * Hold the completion signal, then sleep in sigsuspend() with
		 * an empty mask until aio_handler() has marked the slot both
		 * signalled and done.  The signal is held again after every
		 * wakeup - presumably so that it can only be delivered inside
		 * sigsuspend(); confirm before changing the ordering.
		 */
		sigemptyset(&signalset);
		sighold(aiop->sig);

		while (!aiop->signalled || !aiop->done) {
			sigsuspend(&signalset);
			sighold(aiop->sig);
		}
		break;

#ifdef CRAY
	case A_RECALL:
		/* wait on this slot's iosw through recall() on its fd */
		ioswlist[0] = &aiop->iosw;
		if (recall(aiop->fd, 1, ioswlist) < 0) {
			doio_fprintf(stderr, "recall() failed:  %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}
		break;

#ifdef RECALL_SIZEOF

	case A_RECALLA:
		/* recalla() takes a descriptor mask; only this fd is set */
		RECALL_INIT(mask);
		RECALL_SET(mask, aiop->fd);
		if (recalla(mask) < 0) {
			doio_fprintf(stderr, "recalla() failed:  %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}

		RECALL_CLR(mask, aiop->fd);
		break;
#endif

	case A_RECALLS:
		/* wait on this slot's iosw through recalls() */
		ioswlist[0] = &aiop->iosw;
		if (recalls(1, ioswlist) < 0) {
			doio_fprintf(stderr, "recalls failed:  %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}
		break;
#endif /* CRAY */

#ifdef sgi
	case A_CALLBACK:
		/*
		 * Keep suspending until the slot's done counter is non-zero;
		 * cnt counts how many aio_suspend() calls that took.
		 */
		aioary[0] = &aiop->aiocb;
		cnt = 0;
		do {
			r = aio_suspend(aioary, 1, NULL);
			if (r == -1) {
				doio_fprintf(stderr,
					     "aio_suspend failed: %s (%d)\n",
					     SYSERR, errno);
				exit(E_SETUP);
			}
			cnt++;
		} while (aiop->done == 0);

#if 0
		/*
		 * after having this set for a while, I've decided that
		 * it's too noisy
		 */
		if (cnt > 1)
			doio_fprintf(stderr,
				     "aio_wait: callback wait took %d tries\n",
				     cnt);
#endif

		/*
		 * Note: cb_handler already calls aio_done
		 */
		break;

	case A_SUSPEND:
		/* a single aio_suspend(), then aio_done() to record the result */
		aioary[0] = &aiop->aiocb;
		r = aio_suspend(aioary, 1, NULL);
		if (r == -1) {
			doio_fprintf(stderr, "aio_suspend failed: %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}

		aio_done(aiop);
		break;
#endif
	}

/*printf("aio_wait: errno %d return %d\n", aiop->aio_errno, aiop->aio_ret);*/

	return 0;
}
#endif /* !linux */

/*
 * Format specified time into HH:MM:SS format.  t is the time to format
 * in seconds (as returned from time(2)).
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.  If t cannot be converted to local time the buffer holds
 * "??:??:??" instead.
 */

char *hms(time_t t)
{
	static char ascii_time[9];
	struct tm *ltime;

	ltime = localtime(&t);

	/* localtime() returns NULL on failure; never hand that to strftime() */
	if (ltime == NULL ||
	    strftime(ascii_time, sizeof(ascii_time), "%H:%M:%S", ltime) == 0) {
		strcpy(ascii_time, "??:??:??");
	}

	return ascii_time;
}

/*
 * Simple routine to check if an async io request has completed.
 *
 * CRAY:  returns the sw_flag member of the slot's iosw.
 * sgi:   stores aio_error() in aio_errno and, once that is no longer
 *        EINPROGRESS, stores aio_return() in aio_ret.  Returns non-zero
 *        when the request is no longer in progress.  A -1 from either
 *        call is reported and the process exits with E_SETUP.
 * other: always returns -1 (invalid).
 */

int aio_done(struct aio_info *ainfo)
{
#if defined(CRAY)
	return ainfo->iosw.sw_flag;
#elif defined(sgi)
	int in_progress;

	ainfo->aio_errno = aio_error(&ainfo->aiocb);
	if (ainfo->aio_errno == -1) {
		doio_fprintf(stderr, "aio_done: aio_error failed: %s (%d)\n",
			     SYSERR, errno);
		exit(E_SETUP);
	}

	in_progress = (ainfo->aio_errno == EINPROGRESS);
	if (!in_progress) {
		ainfo->aio_ret = aio_return(&ainfo->aiocb);
		if (ainfo->aio_ret == -1) {
			doio_fprintf(stderr,
				     "aio_done: aio_return failed: %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}
	}

	return !in_progress;
#else
	return -1;		/* invalid */
#endif
}

/*
 * Routine to handle upanic() - it first attempts to set the panic flag.  If
 * the flag cannot be set, an error message is issued.  A call to upanic
 * with PA_PANIC is then done unconditionally, in case the panic flag was set
 * from outside the program (as with the panic(8) program).
 *
 * Note - we only execute the upanic code if -U was used, and the passed in
 * mask is set in the Upanic_Conditions bitmask.
 */

void doio_upanic(int mask)
{
	/* nothing to do unless -U was given and this condition was selected */
	if (!U_opt || !(mask & Upanic_Conditions))
		return;

#ifdef CRAY
	/* failing to set the flag only earns a warning; the panic is tried anyway */
	if (upanic(PA_SET) < 0) {
		doio_fprintf(stderr,
			     "WARNING - Could not set the panic flag - upanic(PA_SET) failed:  %s (%d)\n",
			     SYSERR, errno);
	}

	upanic(PA_PANIC);
#endif
#ifdef sgi
	syssgi(1005);		/* syssgi test panic - DEBUG kernels only */
#endif

	/* printed whenever control gets this far */
	doio_fprintf(stderr, "WARNING - upanic() failed\n");
}

/*
 * Parse cmdline options/arguments and set appropriate global variables.
 * If the cmdline is valid, return 0 to caller.  Otherwise exit with a status
 * of 1.
 */

int parse_cmdline(int argc, char **argv, char *opts)
{
	int c;
	char cc, *cp = NULL, *tok = NULL;
	extern int opterr;
	extern int optind;
	extern char *optarg;
	struct smap *s;
	char *memargs[NMEMALLOC];
	int nmemargs, ma;

	if (*argv[0] == '-') {
		argv[0]++;
		Execd = 1;
	}

	if ((Prog = strrchr(argv[0], '/')) == NULL) {
		Prog = argv[0];
	} else {
		Prog++;
	}

	opterr = 0;
	while ((c = getopt(argc, argv, opts)) != EOF) {
		switch ((char)c) {
		case 'a':
			a_opt++;
			break;

		case 'C':
			C_opt++;
			for (s = checkmap; s->string != NULL; s++)
				if (!strcmp(s->string, optarg))
					break;
			if (s->string == NULL && tok != NULL) {
				fprintf(stderr,
					"%s%s:  Illegal -C arg (%s).  Must be one of: ",
					Prog, TagName, tok);

				for (s = checkmap; s->string != NULL; s++)
					fprintf(stderr, "%s ", s->string);
				fprintf(stderr, "\n");
				exit(1);
			}

			switch (s->value) {
			case C_DEFAULT:
				Data_Fill = doio_pat_fill;
				Data_Check = doio_pat_check;
				break;
			default:
				fprintf(stderr,
					"%s%s:  Unrecognised -C arg '%s' %d",
					Prog, TagName, s->string, s->value);
				exit(1);
			}
			break;

		case 'd':	/* delay between i/o ops */
			parse_delay(optarg);
			break;

		case 'e':
			if (Npes > 1 && Nprocs > 1) {
				fprintf(stderr,
					"%s%s:  Warning - Program is a multi-pe application - exec option is ignored.\n",
					Prog, TagName);
			}
			e_opt++;
			break;

		case 'h':
			help(stdout);
			exit(0);
			break;

		case 'k':
			k_opt++;
			break;

		case 'm':
			Message_Interval = strtol(optarg, &cp, 10);
			if (*cp != '\0' || Message_Interval < 0) {
				fprintf(stderr,
					"%s%s:  Illegal -m arg (%s):  Must be an integer >= 0\n",
					Prog, TagName, optarg);
				exit(1);
			}
			m_opt++;
			break;

		case 'M':	/* memory allocation types */
#ifndef CRAY
			nmemargs = string_to_tokens(optarg, memargs, 32, ",");
			for (ma = 0; ma < nmemargs; ma++) {
				parse_memalloc(memargs[ma]);
			}
			/*dump_memalloc(); */
#else
			fprintf(stderr,
				"%s%s: Error: -M isn't supported on this platform\n",
				Prog, TagName);
			exit(1);
#endif
			M_opt++;
			break;

		case 'N':
			sprintf(TagName, "(%.39s)", optarg);
			break;

		case 'n':
			Nprocs = strtol(optarg, &cp, 10);
			if (*cp != '\0' || Nprocs < 1) {
				fprintf(stderr,
					"%s%s:  Illegal -n arg (%s):  Must be integer > 0\n",
					Prog, TagName, optarg);
				exit(E_USAGE);
			}

			if (Npes > 1 && Nprocs > 1) {
				fprintf(stderr,
					"%s%s:  Program has been built as a multi-pe app.  -n1 is the only nprocs value allowed\n",
					Prog, TagName);
				exit(E_SETUP);
			}
			n_opt++;
			break;

		case 'r':
			Release_Interval = strtol(optarg, &cp, 10);
			if (*cp != '\0' || Release_Interval < 0) {
				fprintf(stderr,
					"%s%s:  Illegal -r arg (%s):  Must be integer >= 0\n",
					Prog, TagName, optarg);
				exit(E_USAGE);
			}

			r_opt++;
			break;

		case 'w':
			Write_Log = optarg;
			w_opt++;
			break;

		case 'v':
			v_opt++;
			break;

		case 'V':
			if (strcasecmp(optarg, "sync") == 0) {
				Validation_Flags = O_SYNC;
			} else if (strcasecmp(optarg, "buffered") == 0) {
				Validation_Flags = 0;
#ifdef CRAY
			} else if (strcasecmp(optarg, "parallel") == 0) {
				Validation_Flags = O_PARALLEL;
			} else if (strcasecmp(optarg, "ldraw") == 0) {
				Validation_Flags = O_LDRAW;
			} else if (strcasecmp(optarg, "raw") == 0) {
				Validation_Flags = O_RAW;
#endif
#ifdef sgi
			} else if (strcasecmp(optarg, "direct") == 0) {
				Validation_Flags = O_DIRECT;
#endif
			} else {
				if (sscanf
				    (optarg, "%i%c", &Validation_Flags,
				     &cc) != 1) {
					fprintf(stderr,
						"%s:  Invalid -V argument (%s) - must be a decimal, hex, or octal\n",
						Prog, optarg);
					fprintf(stderr,
						"    number, or one of the following strings:  'sync',\n");
					fprintf(stderr,
						"    'buffered', 'parallel', 'ldraw', or 'raw'\n");
					exit(E_USAGE);
				}
			}
			V_opt++;
			break;
		case 'U':
			tok = strtok(optarg, ",");
			while (tok != NULL) {
				for (s = Upanic_Args; s->string != NULL; s++)
					if (strcmp(s->string, tok) == 0)
						break;

				if (s->string == NULL) {
					fprintf(stderr,
						"%s%s:  Illegal -U arg (%s).  Must be one of: ",
						Prog, TagName, tok);

					for (s = Upanic_Args; s->string != NULL;
					     s++)
						fprintf(stderr, "%s ",
							s->string);

					fprintf(stderr, "\n");

					exit(1);
				}

				Upanic_Conditions |= s->value;
				tok = strtok(NULL, ",");
			}

			U_opt++;
			break;

		case '?':
			usage(stderr);
			exit(E_USAGE);
			break;
		}
	}

	/*
	 * Supply defaults
	 */

	if (!C_opt) {
		Data_Fill = doio_pat_fill;
		Data_Check = doio_pat_check;
	}

	if (!U_opt)
		Upanic_Conditions = 0;

	if (!n_opt)
		Nprocs = 1;

	if (!r_opt)
		Release_Interval = DEF_RELEASE_INTERVAL;

	if (!M_opt) {
		Memalloc[Nmemalloc].memtype = MEM_DATA;
		Memalloc[Nmemalloc].flags = 0;
		Memalloc[Nmemalloc].name = NULL;
		Memalloc[Nmemalloc].space = NULL;
		Nmemalloc++;
	}

	/*
	 * Initialize input stream
	 */

	if (argc == optind) {
		Infile = NULL;
	} else {
		Infile = argv[optind++];
	}

	if (argc != optind) {
		usage(stderr);
		exit(E_USAGE);
	}

	return 0;
}

/*
 * Parse memory allocation types
 *
 * Types are:
 *  Data
 *  T3E-shmem:blksize[:nblks]
 *  SysV-shmem:shmid:blksize:nblks
 *	if shmid is "private", use IPC_PRIVATE
 *	and nblks is not required
 *
 *  mmap:flags:filename:blksize[:nblks]
 *   flags are one of:
 *	p - private (MAP_PRIVATE)
 *	a - private, MAP_AUTORESRV
 *	l - local (MAP_LOCAL)
 *	s - shared (nblks required)
 *
 *   plus any of:
 *	f - fixed address (MAP_FIXED)
 *	A - use an address without MAP_FIXED
 *	G - autogrow (map once at startup)
 *	U - sets MEMF_FILE (help() describes it as "Unlink file when done")
 *
 *  mmap:flags:devzero
 *	mmap /dev/zero  (shared not allowed)
 *	maps the first 4096 bytes of /dev/zero
 *
 * - put a directory at the beginning of the shared
 *   regions saying what pid has what region.
 *	DIRMAGIC
 *	BLKSIZE
 *	NBLKS
 *	nblks worth of directories - 1 int pids
 */
#ifndef CRAY
/*
 * Parse one memory allocation spec ("data[:flags]", "mmap[:flags[:file]]"
 * or "shmem[:shmid[:nblks[:flags]]]") and append it to Memalloc[].
 *
 * arg - the spec; it is split on ':' by string_to_tokens() and the
 *       resulting token pointers are stored in the Memalloc[] entry.
 *
 * When Memalloc[] is already full an error is printed and the spec is
 * ignored.  An empty or unknown memory type is reported and the process
 * exits with status 1.
 */
void parse_memalloc(char *arg)
{
	char *allocargs[NMEMALLOC];
	int nalloc;
	struct memalloc *M;

	if (Nmemalloc >= NMEMALLOC) {
		doio_fprintf(stderr, "Error - too many memory types (%d).\n",
			     Nmemalloc);
		return;
	}

	M = &Memalloc[Nmemalloc];

	/*
	 * Bound the tokenizer by the real size of allocargs[].
	 * NOTE(review): assumes the third argument of string_to_tokens() is
	 * the capacity of the token array - confirm.
	 */
	nalloc = string_to_tokens(arg, allocargs,
				  (int)(sizeof(allocargs) /
					sizeof(allocargs[0])), ":");
	if (nalloc < 1) {
		/* no tokens: allocargs[0] would be uninitialised below */
		doio_fprintf(stderr, "Error - empty memory type.\n");
		exit(1);
	}

	if (!strcmp(allocargs[0], "data")) {
		M->memtype = MEM_DATA;
		M->flags = 0;
		M->name = NULL;
		M->space = NULL;
		Nmemalloc++;
		if (nalloc >= 2) {
			if (strchr(allocargs[1], 'p'))
				M->flags |= MEMF_MPIN;
		}
	} else if (!strcmp(allocargs[0], "mmap")) {
		/* mmap:flags:filename[:size] */
		M->memtype = MEM_MMAP;
		M->flags = 0;
		M->space = NULL;
		/* the flags token is allocargs[1], so two tokens are needed */
		if (nalloc >= 2) {
			if (strchr(allocargs[1], 'p'))
				M->flags |= MEMF_PRIVATE;
			if (strchr(allocargs[1], 'a'))
				M->flags |= MEMF_AUTORESRV;
			if (strchr(allocargs[1], 'l'))
				M->flags |= MEMF_LOCAL;
			if (strchr(allocargs[1], 's'))
				M->flags |= MEMF_SHARED;

			if (strchr(allocargs[1], 'f'))
				M->flags |= MEMF_FIXADDR;
			if (strchr(allocargs[1], 'A'))
				M->flags |= MEMF_ADDR;
			if (strchr(allocargs[1], 'G'))
				M->flags |= MEMF_AUTOGROW;

			if (strchr(allocargs[1], 'U'))
				M->flags |= MEMF_FILE;
		} else {
			M->flags |= MEMF_PRIVATE;
		}

		/*
		 * /dev/zero mappings default to private when neither
		 * private nor local was requested.
		 */
		if (nalloc > 2) {
			if (!strcmp(allocargs[2], "devzero")) {
				M->name = "/dev/zero";
				if ((M->flags &
				     (MEMF_PRIVATE | MEMF_LOCAL)) == 0)
					M->flags |= MEMF_PRIVATE;
			} else {
				M->name = allocargs[2];
			}
		} else {
			M->name = "/dev/zero";
			if ((M->flags & (MEMF_PRIVATE | MEMF_LOCAL)) == 0)
				M->flags |= MEMF_PRIVATE;
		}
		Nmemalloc++;

	} else if (!strcmp(allocargs[0], "shmem")) {
		/* shmem:shmid:size */
		M->memtype = MEM_SHMEM;
		M->flags = 0;
		M->space = NULL;
		if (nalloc >= 2) {
			M->name = allocargs[1];
		} else {
			M->name = NULL;
		}
		if (nalloc >= 3) {
			sscanf(allocargs[2], "%i", &M->nblks);
		} else {
			M->nblks = 0;
		}
		if (nalloc >= 4) {
			if (strchr(allocargs[3], 'p'))
				M->flags |= MEMF_MPIN;
		}

		Nmemalloc++;
	} else {
		doio_fprintf(stderr, "Error - unknown memory type '%s'.\n",
			     allocargs[0]);
		exit(1);
	}
}

/*
 * Debugging aid: print every entry of Memalloc[] (type, flags, name and
 * nblks) to stdout.
 */
void dump_memalloc(void)
{
	int ma;
	char *mt;

	if (Nmemalloc == 0) {
		printf("No memory allocation strategies devined\n");
		return;
	}

	for (ma = 0; ma < Nmemalloc; ma++) {
		switch (Memalloc[ma].memtype) {
		case MEM_DATA:
			mt = "data";
			break;
		case MEM_SHMEM:
			mt = "shmem";
			break;
		case MEM_MMAP:
			mt = "mmap";
			break;
		default:
			mt = "unknown";
			break;
		}
		printf("mstrat[%d] = %d %s\n", ma, Memalloc[ma].memtype, mt);
		/*
		 * "data" entries (and "shmem" without an id) carry a NULL
		 * name; passing NULL for %s is undefined, so substitute text.
		 */
		printf("\tflags=%#o name='%s' nblks=%d\n",
		       Memalloc[ma].flags,
		       Memalloc[ma].name != NULL ? Memalloc[ma].name : "(null)",
		       Memalloc[ma].nblks);
	}
}

#endif /* !CRAY */

/*
 * -d <op>:<time> - doio inter-operation delay
 *	currently this permits ONE type of delay between operations.
 *
 * arg is split on ':' into the operation name, which must appear in
 * delaymap, and the delay time, which is read with "%i".  On success
 * delayop and delaytime are set.  Any malformed argument is reported
 * and the process exits with status 1.  Tokens after the time only
 * produce a warning.
 */

void parse_delay(char *arg)
{
	char *delayargs[NMEMALLOC];
	int ndelay;
	struct smap *s;

	/*
	 * Bound the tokenizer by the real size of delayargs[].
	 * NOTE(review): assumes the third argument of string_to_tokens() is
	 * the capacity of the token array - confirm.
	 */
	ndelay = string_to_tokens(arg, delayargs,
				  (int)(sizeof(delayargs) /
					sizeof(delayargs[0])), ":");
	if (ndelay < 2) {
		doio_fprintf(stderr,
			     "Illegal delay arg (%s). Must be operation:time\n",
			     arg);
		exit(1);
	}
	for (s = delaymap; s->string != NULL; s++)
		if (!strcmp(s->string, delayargs[0]))
			break;
	if (s->string == NULL) {
		fprintf(stderr,
			"Illegal Delay arg (%s).  Must be one of: ", arg);

		for (s = delaymap; s->string != NULL; s++)
			fprintf(stderr, "%s ", s->string);
		fprintf(stderr, "\n");
		exit(1);
	}

	delayop = s->value;

	/* a time that is not a number must not silently leave delaytime as is */
	if (sscanf(delayargs[1], "%i", &delaytime) != 1) {
		doio_fprintf(stderr,
			     "Illegal delay time (%s). Must be an integer\n",
			     delayargs[1]);
		exit(1);
	}

	if (ndelay > 2) {
		fprintf(stderr, "Warning: extra delay arguments ignored.\n");
	}
}

/*
 * Usage clause - obvious
 */

int usage(FILE * stream)
{
	/*
	 * In a multi-pe application (Npes > 1) only vpe 0 prints the usage
	 * line, so it is not repeated by every process.
	 */
	const int should_print = (Npes <= 1 || Vpe == 0);

	if (should_print) {
		fprintf(stream,
			"usage%s:  %s [-aekv] [-m message_interval] [-n nprocs] [-r release_interval] [-w write_log] [-V validation_ftype] [-U upanic_cond] [infile]\n",
			TagName, Prog);
	}

	return 0;
}

/*
 * Print the usage line followed by a description of every option to
 * stream.  In a multi-pe application only vpe 0 prints anything.
 */
void help(FILE * stream)
{
	/*
	 * Only the app running on vpe 0 gets to issue help - this prevents
	 * everybody in the application from doing this.
	 */

	if (Npes > 1 && Vpe != 0) {
		return;
	}

	usage(stream);
	fprintf(stream, "\n");
	fprintf(stream,
		"\t-a                   abort - kill all doio processes on data compare\n");
	fprintf(stream,
		"\t                     errors.  Normally only the erroring process exits\n");
	fprintf(stream, "\t-C data-pattern-type \n");
	fprintf(stream,
		"\t                     Available data patterns are:\n");
	fprintf(stream, "\t                     default - repeating pattern\n");
	fprintf(stream, "\t-d Operation:Time    Inter-operation delay.\n");
	fprintf(stream, "\t                     Operations are:\n");
	fprintf(stream,
		"\t                         select:time (1 second=1000000)\n");
	fprintf(stream, "\t                         sleep:time (1 second=1)\n");
#ifdef sgi
	fprintf(stream,
		"\t                         sginap:time (1 second=CLK_TCK=100)\n");
#endif
	fprintf(stream, "\t                         alarm:time (1 second=1)\n");
	fprintf(stream,
		"\t-e                   Re-exec children before entering the main\n");
	fprintf(stream,
		"\t                     loop.  This is useful for spreading\n");
	fprintf(stream,
		"\t                     procs around on multi-pe systems.\n");
	fprintf(stream,
		"\t-k                   Lock file regions during writes using fcntl()\n");
	fprintf(stream,
		"\t-v                   Verify writes - this is done by doing a buffered\n");
	fprintf(stream,
		"\t                     read() of the data if file io was done, or\n");
	fprintf(stream,
		"\t                     an ssread()of the data if sds io was done\n");
#ifndef CRAY
	fprintf(stream,
		"\t-M                   Data buffer allocation method\n");
	fprintf(stream, "\t                     alloc-type[,type]\n");
#ifdef sgi
	fprintf(stream, "\t			    data:flags\n");
	fprintf(stream, "\t			        p - mpin buffer\n");
	fprintf(stream, "\t			    shmem:shmid:size:flags\n");
	fprintf(stream, "\t			        p - mpin buffer\n");
#else
	fprintf(stream, "\t			    data\n");
	fprintf(stream, "\t			    shmem:shmid:size\n");
#endif /* sgi */
	fprintf(stream, "\t			    mmap:flags:filename\n");
	fprintf(stream, "\t			        p - private\n");
#ifdef sgi
	fprintf(stream, "\t			        s - shared\n");
	fprintf(stream, "\t			        l - local\n");
	fprintf(stream, "\t			        a - autoresrv\n");
	fprintf(stream, "\t			        G - autogrow\n");
#else
	fprintf(stream,
		"\t			        s - shared (shared file must exist\n");
	fprintf(stream,
		"\t			            and have needed length)\n");
#endif
	fprintf(stream,
		"\t			        f - fixed address (not used)\n");
	/* parse_memalloc() looks for 'A' here, not 'a' */
	fprintf(stream,
		"\t			        A - specify address (not used)\n");
	fprintf(stream,
		"\t			        U - Unlink file when done\n");
	fprintf(stream,
		"\t			        The default flag is private\n");
	fprintf(stream, "\n");
#endif /* !CRAY */
	fprintf(stream,
		"\t-m message_interval  Generate a message every 'message_interval'\n");
	fprintf(stream,
		"\t                     requests.  An interval of 0 suppresses\n");
	fprintf(stream,
		"\t                     messages.  The default is 0.\n");
	fprintf(stream, "\t-N tagname           Tag name, for Monster.\n");
	fprintf(stream, "\t-n nprocs            # of processes to start up\n");
	fprintf(stream,
		"\t-r release_interval  Release all memory and close\n");
	fprintf(stream,
		"\t                     files every release_interval operations.\n");
	fprintf(stream,
		"\t                     By default procs never release memory\n");
	fprintf(stream,
		"\t                     or close fds unless they have to.\n");
	fprintf(stream,
		"\t-V validation_ftype  The type of file descriptor to use for doing data\n");
	fprintf(stream,
		"\t                     validation.  validation_ftype may be an octal,\n");
	fprintf(stream,
		"\t                     hex, or decimal number representing the open()\n");
	fprintf(stream,
		"\t                     flags, or may be one of the following strings:\n");
	fprintf(stream,
		"\t                     'buffered' - validate using bufferd read\n");
	fprintf(stream,
		"\t                     'sync'     - validate using O_SYNC read\n");
#ifdef sgi
	fprintf(stream,
		"\t                     'direct'   - validate using O_DIRECT read\n");
#endif
#ifdef CRAY
	fprintf(stream,
		"\t                     'ldraw'    - validate using O_LDRAW read\n");
	fprintf(stream,
		"\t                     'parallel' - validate using O_PARALLEL read\n");
	fprintf(stream,
		"\t                     'raw'      - validate using O_RAW read\n");
#endif
	fprintf(stream, "\t                     By default, 'parallel'\n");
	fprintf(stream,
		"\t                     is used if the write was done with O_PARALLEL\n");
	fprintf(stream,
		"\t                     or 'buffered' for all other writes.\n");
	fprintf(stream,
		"\t-w write_log         File to log file writes to.  The doio_check\n");
	fprintf(stream,
		"\t                     program can reconstruct datafiles using the\n");
	fprintf(stream,
		"\t                     write_log, and detect if a file is corrupt\n");
	fprintf(stream,
		"\t                     after all procs have exited.\n");
	fprintf(stream,
		"\t-U upanic_cond       Comma separated list of conditions that will\n");
	fprintf(stream,
		"\t                     cause a call to upanic(PA_PANIC).\n");
	fprintf(stream,
		"\t                     'corruption' -> upanic on bad data comparisons\n");
	fprintf(stream,
		"\t                     'iosw'     ---> upanic on unexpected async iosw\n");
	fprintf(stream,
		"\t                     'rval'     ---> upanic on unexpected syscall rvals\n");
	fprintf(stream,
		"\t                     'all'      ---> all of the above\n");
	fprintf(stream, "\n");
	fprintf(stream,
		"\tinfile               Input stream - default is stdin - must be a list\n");
	fprintf(stream,
		"\t                     of io_req structures (see doio.h).  Currently\n");
	fprintf(stream,
		"\t                     only the iogen program generates the proper\n");
	fprintf(stream, "\t                     format\n");
}