/*
 * Blktrace replay utility - Play traces back
 *
 * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#if !defined(_GNU_SOURCE)
#	define _GNU_SOURCE
#endif

#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <libaio.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <dirent.h>
#include <stdarg.h>

#include <getopt.h>

#include "list.h"
#include "btrecord.h"

/* 
 * ========================================================================
 * ==== STRUCTURE DEFINITIONS =============================================
 * ========================================================================
 */

/**
 * Each device map has one of these:
 * 
 * @head:	Linked on to map_devs
 * @from_dev:	Device name as seen on recorded system
 * @to_dev:	Device name to be used on replay system
 */
struct map_dev {
	struct list_head head;
	char *from_dev, *to_dev;
};

/**
 * Each device name specified has one of these (until threads are created)
 *
 * @head: 	Linked onto input_devs
 * @devnm: 	Device name -- 'sd*'
 */
struct dev_info {
	struct list_head head;
	char *devnm;
};

/*
 * Per input file information
 *
 * @head: 	Used to link up on input_files
 * @free_iocbs: List of free iocb's available for use
 * @used_iocbs: List of iocb's currently outstanding
 * @mutex: 	Mutex used with condition variable to protect volatile values
 * @cond: 	Condition variable used when waiting on a volatile value change
 * @naios_out: 	Current number of AIOs outstanding on this context
 * @naios_free: Number of AIOs on the free list (short cut for list_len)
 * @send_wait: 	Boolean: When true, the sub thread is waiting on free IOCBs
 * @reap_wait: 	Boolean: When true, the rec thread is waiting on used IOCBs
 * @send_done: 	Boolean: When true, the sub thread has completed work
 * @reap_done: 	Boolean: When true, the rec thread has completed work
 * @sub_thread: Thread used to submit IOs.
 * @rec_thread: Thread used to reclaim IOs.
 * @ctx: 	IO context
 * @devnm: 	Copy of the device name being managed by this thread
 * @file_name: 	Full name of the input file
 * @cpu: 	CPU this thread is pinned to
 * @ifd: 	Input file descriptor
 * @ofd: 	Output file descriptor
 * @iterations: Remaining iterations to process
 * @vfp:	For verbose dumping of actions performed
 */
struct thr_info {
	struct list_head head, free_iocbs, used_iocbs;
	pthread_mutex_t mutex;
	pthread_cond_t cond;
	volatile long naios_out, naios_free;
	volatile int send_wait, reap_wait, send_done, reap_done;
	pthread_t sub_thread, rec_thread;
	io_context_t ctx;
	char *devnm, *file_name;
	int cpu, ifd, ofd, iterations;
	FILE *vfp;
};

/*
 * Every Asynchronous IO used has one of these (naios per file/device).
 *
 * @iocb:	IOCB sent down via io_submit
 * @head:	Linked onto file_list.free_iocbs or file_list.used_iocbs
 * @tip:	Pointer to per-thread information this IO is associated with
 * @nbytes:	Number of bytes in buffer associated with iocb
 */
struct iocb_pkt {
	struct iocb iocb;
	struct list_head head;
	struct thr_info *tip;
	int nbytes;
};

/* 
 * ========================================================================
 * ==== GLOBAL VARIABLES ==================================================
 * ========================================================================
 */

static volatile int signal_done = 0;	// Boolean: Signal'ed, need to quit

static char *ibase = "replay";		// Input base name
static char *idir = ".";		// Input directory base
static int cpus_to_use = -1;		// Number of CPUs to use
static int def_iterations = 1;		// Default number of iterations
static int naios = 512;			// Number of AIOs per thread
static int ncpus = 0;			// Number of CPUs in the system
static int verbose = 0;			// Boolean: Output some extra info
static int write_enabled = 0;		// Boolean: Enable writing
static __u64 genesis = ~0;		// Earliest time seen
static __u64 rgenesis;			// Our start time
static size_t pgsize;			// System Page size
static int nb_sec = 512;		// Number of bytes per sector
static LIST_HEAD(input_devs);		// List of devices to handle
static LIST_HEAD(input_files);		// List of input files to handle
static LIST_HEAD(map_devs);		// List of device maps
static int nfiles = 0;			// Number of files to handle
static int no_stalls = 0;		// Boolean: Disable pre-stalls
static unsigned acc_factor = 1;		// Int: Acceleration factor
static int find_records = 0;		// Boolean: Find record files auto

/*
 * Variables managed under control of condition variables.
 *
 * n_reclaims_done: 	Counts number of reclaim threads that have completed.
 * n_replays_done:	Counts number of replay threads that have completed.
 * n_replays_ready:	Counts number of replay threads ready to start.
 * n_iters_done:	Counts number of replay threads done one iteration.
 * iter_start:		Starts an iteration for the replay threads.
 */
static volatile int n_reclaims_done = 0;
static pthread_mutex_t reclaim_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_done_cond = PTHREAD_COND_INITIALIZER;

static volatile int n_replays_done = 0;
static pthread_mutex_t replay_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t replay_done_cond = PTHREAD_COND_INITIALIZER;

static volatile int n_replays_ready = 0;
static pthread_mutex_t replay_ready_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t replay_ready_cond = PTHREAD_COND_INITIALIZER;

static volatile int n_iters_done = 0;
static pthread_mutex_t iter_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_done_cond = PTHREAD_COND_INITIALIZER;

static volatile int iter_start = 0;
static pthread_mutex_t iter_start_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_start_cond = PTHREAD_COND_INITIALIZER;

/* 
 * ========================================================================
 * ==== FORWARD REFERENCES ================================================
 * ========================================================================
 */

static void *replay_sub(void *arg);
static void *replay_rec(void *arg);
static char usage_str[];

/* 
 * ========================================================================
 * ==== INLINE ROUTINES ===================================================
 * ========================================================================
 */

/*
 * The 'fatal' routine will output a perror message (if errstring is !NULL),
 * display a message built from the format string and variable arguments,
 * and then exit with the specified exit value.
 */
#define ERR_ARGS			1
#define ERR_SYSCALL			2
static inline void fatal(const char *errstring, const int exitval,
			 const char *fmt, ...)
{
	va_list ap;

	if (errstring)
		perror(errstring);

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);

	exit(exitval);
	/*NOTREACHED*/
}

static inline long long unsigned du64_to_sec(__u64 du64)
{
	return (long long unsigned)du64 / (1000 * 1000 * 1000);
}

static inline long long unsigned du64_to_nsec(__u64 du64)
{
	return llabs((long long)du64) % (1000 * 1000 * 1000);
}

/**
 * min - Return minimum of two integers
 */
static inline int min(int a, int b)
{ 
	return a < b ? a : b;
}

/**
 * minl - Return minimum of two longs
 */
static inline long minl(long a, long b)
{ 
	return a < b ? a : b;
}

/**
 * usage - Display usage string and version
 */
static inline void usage(void)
{
	fprintf(stderr, "Usage: btreplay -- version %s\n%s", 
		my_btversion, usage_str);
}

/**
 * is_send_done - Returns true if sender should quit early
 * @tip: Per-thread information
 */
static inline int is_send_done(struct thr_info *tip)
{
	return signal_done || tip->send_done;
}

/**
 * is_reap_done - Returns true if reaper should quit early
 * @tip: Per-thread information
 */
static inline int is_reap_done(struct thr_info *tip)
{
	return signal_done || (tip->send_done && tip->naios_out == 0);
}

/**
 * ts2ns - Convert timespec values to a nanosecond value
 */
#define NS_TICKS		((__u64)1000 * (__u64)1000 * (__u64)1000)
static inline __u64 ts2ns(struct timespec *ts)
{
	return ((__u64)(ts->tv_sec) * NS_TICKS) + (__u64)(ts->tv_nsec);
}

/**
 * tv2ns - Convert timeval values to a nanosecond value
 */
static inline __u64 tv2ns(struct timeval *tp)
{
	return ((__u64)(tp->tv_sec) * NS_TICKS) +
	       ((__u64)(tp->tv_usec) * (__u64)1000);
}

/**
 * touch_memory - Force physical memory to be allocated by touching it
 *
 * For malloc()ed memory we need to /touch/ it to make it really
 * exist. Otherwise, for writes (to storage) things may not work
 * as planned - we see Linux just use a single area to /read/ from
 * (as there isn't any memory that has been associated with the
 * allocated virtual addresses yet).
 */
static inline void touch_memory(char *buf, size_t bsize)
{
#if defined(PREP_BUFS)
	memset(buf, 0, bsize);
#else
	size_t i;

	for (i = 0; i < bsize; i += pgsize)
		buf[i] = 0;
#endif
}

/**
 * buf_alloc - Returns a page-aligned buffer of the specified size
 * @nbytes: Number of bytes to allocate
 */
static inline void *buf_alloc(size_t nbytes)
{
	void *buf;

	if (posix_memalign(&buf, pgsize, nbytes)) {
		fatal("posix_memalign", ERR_SYSCALL, "Allocation failed\n");
		/*NOTREACHED*/
	}

	return buf;
}

/**
 * gettime - Returns current time 
 */
static inline __u64 gettime(void)
{
	static int use_clock_gettime = -1;		// Which clock to use

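	/*
	 * On first use, pick CLOCK_MONOTONIC if the system reports a
	 * resolution for it; otherwise fall back to gettimeofday(). Note
	 * that the clock_settime() call below cannot actually reset
	 * CLOCK_MONOTONIC and is expected to fail harmlessly.
	 */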
	if (use_clock_gettime < 0) {
		use_clock_gettime = clock_getres(CLOCK_MONOTONIC, NULL) == 0;
		if (use_clock_gettime) {
			struct timespec ts = {
				.tv_sec = 0,
				.tv_nsec = 0
			};
			clock_settime(CLOCK_MONOTONIC, &ts);
		}
	}

	if (use_clock_gettime) {
		struct timespec ts;
		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ts2ns(&ts);
	}
	else {
		struct timeval tp;
		gettimeofday(&tp, NULL);
		return tv2ns(&tp);
	}
}

/**
 * setup_signal - Set up a signal handler for the specified signum
 */
static inline void setup_signal(int signum, sighandler_t handler)
{
	if (signal(signum, handler) == SIG_ERR) {
		fatal("signal", ERR_SYSCALL, "Failed to set signal %d\n",
			signum);
		/*NOTREACHED*/
	}
}

/* 
 * ========================================================================
 * ==== CONDITION VARIABLE ROUTINES =======================================
 * ========================================================================
 */

/**
 * __set_cv - Increments a variable under condition variable control.
 * @pmp: 	Pointer to the associated mutex
 * @pcp: 	Pointer to the associated condition variable
 * @vp: 	Pointer to the variable being incremented
 * @mxv: 	Max value for variable (Used only when ASSERTS are on)
 */
static inline void __set_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
			    volatile int *vp, 
			    __attribute__((__unused__))int mxv)
{
	pthread_mutex_lock(pmp);
	assert(*vp < mxv);
	*vp += 1;
	pthread_cond_signal(pcp);
	pthread_mutex_unlock(pmp);
}

/**
 * __wait_cv - Waits for a variable under cond var control to hit a value
 * @pmp: 	Pointer to the associated mutex
 * @pcp: 	Pointer to the associated condition variable
 * @vp: 	Pointer to the variable being incremented
 * @mxv: 	Value to wait for
 */
static inline void __wait_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
			     volatile int *vp, int mxv)
{
	pthread_mutex_lock(pmp);
	while (*vp < mxv)
		pthread_cond_wait(pcp, pmp);
	*vp = 0;
	pthread_mutex_unlock(pmp);
}

static inline void set_reclaim_done(void)
{
	__set_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done,
		 nfiles);
}

static inline void wait_reclaims_done(void)
{
	__wait_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done,
		  nfiles);
}

static inline void set_replay_ready(void)
{
	__set_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready,
		 nfiles);
}

static inline void wait_replays_ready(void)
{
	__wait_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready,
		  nfiles);
}

static inline void set_replay_done(void)
{
	__set_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done,
		nfiles);
}

static inline void wait_replays_done(void)
{
	__wait_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done,
		  nfiles);
}

static inline void set_iter_done(void)
{
	__set_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done,
		nfiles);
}

static inline void wait_iters_done(void)
{
	__wait_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done,
		  nfiles);
}

/**
 * wait_iter_start - Wait for an iteration to start 
 * 
 * This is /slightly/ different: we are waiting for a value to become
 * non-zero, and then we decrement it and go on. 
 */
static inline void wait_iter_start(void)
{
	pthread_mutex_lock(&iter_start_mutex);
	while (iter_start == 0)
		pthread_cond_wait(&iter_start_cond, &iter_start_mutex);
	assert(1 <= iter_start && iter_start <= nfiles);
	iter_start--;
	pthread_mutex_unlock(&iter_start_mutex);
}

/**
 * start_iter - Start an iteration at the replay thread level
 */
static inline void start_iter(void)
{
	pthread_mutex_lock(&iter_start_mutex);
	assert(iter_start == 0);
	iter_start = nfiles;
	pthread_cond_broadcast(&iter_start_cond);
	pthread_mutex_unlock(&iter_start_mutex);
}

/* 
 * ========================================================================
 * ==== CPU RELATED ROUTINES ==============================================
 * ========================================================================
 */

/**
 * get_ncpus - Sets up the global 'ncpus' value
 */
static void get_ncpus(void)
{
#ifdef _SC_NPROCESSORS_ONLN
	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
#else
	int last_cpu;
	int nrcpus = 4096;
	cpu_set_t *cpus;
	size_t size;

realloc:
	cpus = CPU_ALLOC(nrcpus);
	size = CPU_ALLOC_SIZE(nrcpus);
	CPU_ZERO_S(size, cpus);

	if (sched_getaffinity(0, size, cpus)) {
		if (errno == EINVAL && nrcpus < (4096 << 4)) {
			CPU_FREE(cpus);
			nrcpus <<= 1;
			goto realloc;
		}
		fatal("sched_getaffinity", ERR_SYSCALL, "Can't get CPU info\n");
		/*NOTREACHED*/
	}

	/*
	 * Number of CPUs is the highest CPU number present in the
	 * affinity mask, plus one.
	 */
	ncpus = -1;
	for (last_cpu = 0; last_cpu < nrcpus; last_cpu++)
		if (CPU_ISSET_S(last_cpu, size, cpus))
			ncpus = last_cpu;
	ncpus++;
	CPU_FREE(cpus);
#endif
	if (ncpus <= 0) {
		fatal(NULL, ERR_SYSCALL, "Insufficient number of CPUs\n");
		/*NOTREACHED*/
	}
}

/**
 * pin_to_cpu - Pin this thread to a specific CPU
 * @tip: Thread information
 */
static void pin_to_cpu(struct thr_info *tip)
{
	cpu_set_t *cpus;
	size_t size;

	cpus = CPU_ALLOC(ncpus);
	size = CPU_ALLOC_SIZE(ncpus);	

	assert(0 <= tip->cpu && tip->cpu < ncpus);

	CPU_ZERO_S(size, cpus);
	CPU_SET_S(tip->cpu, size, cpus);
	if (sched_setaffinity(0, size, cpus)) {
		fatal("sched_setaffinity", ERR_SYSCALL, "Failed to pin CPU\n");
		/*NOTREACHED*/
	}
	assert(tip->cpu == sched_getcpu());

	if (verbose > 1) {
		int i;
		cpu_set_t *now = CPU_ALLOC(ncpus);

		(void)sched_getaffinity(0, size, now);
		fprintf(tip->vfp, "Pinned to CPU %02d ", tip->cpu);
		for (i = 0; i < ncpus; i++)
			fprintf(tip->vfp, "%1d", CPU_ISSET_S(i, size, now));
		fprintf(tip->vfp, "\n");
	}
}

/* 
 * ========================================================================
 * ==== INPUT DEVICE HANDLERS =============================================
 * ========================================================================
 */

/**
 * add_input_dev - Add a device ('sd*') to the list of devices to handle
 */
static void add_input_dev(char *devnm)
{
	struct list_head *p;
	struct dev_info *dip;

	__list_for_each(p, &input_devs) {
		dip = list_entry(p, struct dev_info, head);
		if (strcmp(dip->devnm, devnm) == 0)
			return;
	}

	dip = malloc(sizeof(*dip));
	dip->devnm = strdup(devnm);
	list_add_tail(&dip->head, &input_devs);
}

/**
 * rem_input_dev - Remove resources associated with this device
 */
static void rem_input_dev(struct dev_info *dip)
{
	list_del(&dip->head);
	free(dip->devnm);
	free(dip);
}

static void find_input_devs(char *idir)
{
	struct dirent *ent;
	DIR *dir = opendir(idir);

	if (dir == NULL) {
		fatal(idir, ERR_ARGS, "Unable to open %s\n", idir);
		/*NOTREACHED*/
	}

	while ((ent = readdir(dir)) != NULL) {
		char *p, *dsf;

		if (strstr(ent->d_name, ".replay.") == NULL)
			continue;

		dsf = strdup(ent->d_name);
		p = strchr(dsf, '.');
		assert(p != NULL);
		*p = '\0';
		add_input_dev(dsf);
		free(dsf);
	}

	closedir(dir);
}

/* 
 * ========================================================================
 * ==== MAP DEVICE INTERFACES =============================================
 * ========================================================================
 */

/**
 * read_map_devs - Read in a set of device mappings from the provided file.
 * @file_name:	File containing device maps
 *
 * We support the notion of multiple such files being specified on the cmd line
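 *
 * Each line holds a whitespace-separated pair: the device name as recorded,
 * followed by the device name to replay onto. For example (illustrative
 * names only):
 *
 *	sdb sdc
 *	sdg sdh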
 */
static void read_map_devs(char *file_name)
{
	FILE *fp;
	char from_dev[256], to_dev[256];

	fp = fopen(file_name, "r");
	if (!fp) {
		fatal(file_name, ERR_SYSCALL, "Could not open map devs file\n");
		/*NOTREACHED*/
	}

	while (fscanf(fp, "%s %s", from_dev, to_dev) == 2) {
		struct map_dev *mdp = malloc(sizeof(*mdp));

		mdp->from_dev = strdup(from_dev);
		mdp->to_dev = strdup(to_dev);
		list_add_tail(&mdp->head, &map_devs);
	}

	fclose(fp);
}

/**
 * release_map_devs - Release resources associated with device mappings.
 */
static void release_map_devs(void)
{
	struct list_head *p, *q;

	list_for_each_safe(p, q, &map_devs) {
		struct map_dev *mdp = list_entry(p, struct map_dev, head);

		list_del(&mdp->head);

		free(mdp->from_dev);
		free(mdp->to_dev);
		free(mdp);
	}
}

/**
 * map_dev - Return the mapped device for that specified
 * @from_dev:	Device name as seen on recorded system
 *
 * Note: If there is no such mapping, we return the same name.
 */
static char *map_dev(char *from_dev)
{
	struct list_head *p;

	__list_for_each(p, &map_devs) {
		struct map_dev *mdp = list_entry(p, struct map_dev, head);

		if (strcmp(from_dev, mdp->from_dev) == 0)
			return mdp->to_dev;
	}

	return from_dev;
}

/* 
 * ========================================================================
 * ==== IOCB MANAGEMENT ROUTINES ==========================================
 * ========================================================================
 */

/**
 * iocb_init - Initialize the fields of an IOCB
 * @tip: Per-thread information
 * @iocbp: IOCB pointer to update
 */
static void iocb_init(struct thr_info *tip, struct iocb_pkt *iocbp)
{
	iocbp->tip = tip;
	iocbp->nbytes = 0;
	iocbp->iocb.u.c.buf = NULL;
}

/**
 * iocb_setup - Set up an iocb with this AIOs information
 * @iocbp: IOCB pointer to update
 * @rw: Direction (0 == write, 1 == read)
 * @n: Number of bytes to transfer
 * @off: Offset (in bytes)
 */
static void iocb_setup(struct iocb_pkt *iocbp, int rw, int n, long long off)
{
	char *buf;
	struct iocb *iop = &iocbp->iocb;

	assert(rw == 0 || rw == 1);
	assert(0 < n && (n % nb_sec) == 0);
	assert(0 <= off);

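	/*
	 * Reuse the previously allocated buffer when it is already large
	 * enough; otherwise free it and allocate a new page-aligned one.
	 */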
	if (iocbp->nbytes) {
		if (iocbp->nbytes >= n) {
			buf = iop->u.c.buf;
			goto prep;
		}

		assert(iop->u.c.buf);
		free(iop->u.c.buf);
	}

	buf = buf_alloc(n);
	iocbp->nbytes = n;

prep:
	if (rw)
		io_prep_pread(iop, iocbp->tip->ofd, buf, n, off);
	else {
		assert(write_enabled);
		io_prep_pwrite(iop, iocbp->tip->ofd, buf, n, off);
		touch_memory(buf, n);
	}

	iop->data = iocbp;
}

/* 
 * ========================================================================
 * ==== PER-THREAD SET UP & TEAR DOWN =====================================
 * ========================================================================
 */

/**
 * tip_init - Per thread initialization function
 */
static void tip_init(struct thr_info *tip)
{
	int i;

	INIT_LIST_HEAD(&tip->free_iocbs);
	INIT_LIST_HEAD(&tip->used_iocbs);

	pthread_mutex_init(&tip->mutex, NULL);
	pthread_cond_init(&tip->cond, NULL);

	if (io_setup(naios, &tip->ctx)) {
		fatal("io_setup", ERR_SYSCALL, "io_setup failed\n");
		/*NOTREACHED*/
	}

	tip->ofd = -1;
	tip->naios_out = 0;
	tip->send_done = tip->reap_done = 0;
	tip->send_wait = tip->reap_wait = 0;

	memset(&tip->sub_thread, 0, sizeof(tip->sub_thread));
	memset(&tip->rec_thread, 0, sizeof(tip->rec_thread));

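	/*
	 * Pre-allocate all IOCBs onto the free list; the data buffers
	 * themselves are allocated lazily in iocb_setup() as IOs are mapped.
	 */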
	for (i = 0; i < naios; i++) {
		struct iocb_pkt *iocbp = buf_alloc(sizeof(*iocbp));

		iocb_init(tip, iocbp);
		list_add_tail(&iocbp->head, &tip->free_iocbs);
	}
	tip->naios_free = naios;

	if (verbose > 1) {
		char fn[MAXPATHLEN];

		sprintf(fn, "%s/%s.%s.%d.rep", idir, tip->devnm, ibase, 
			tip->cpu);
		tip->vfp = fopen(fn, "w");
		if (!tip->vfp) {
			fatal(fn, ERR_SYSCALL, "Failed to open report\n");
			/*NOTREACHED*/
		}

		setlinebuf(tip->vfp);
	}

	if (pthread_create(&tip->sub_thread, NULL, replay_sub, tip)) {
		fatal("pthread_create", ERR_SYSCALL, 
			"thread create failed\n");
		/*NOTREACHED*/
	}

	if (pthread_create(&tip->rec_thread, NULL, replay_rec, tip)) {
		fatal("pthread_create", ERR_SYSCALL, 
			"thread create failed\n");
		/*NOTREACHED*/
	}
}

/**
 * tip_release - Release resources associated with this thread
 */
static void tip_release(struct thr_info *tip)
{
	struct list_head *p, *q;

	assert(tip->send_done);
	assert(tip->reap_done);
	assert(list_len(&tip->used_iocbs) == 0);
	assert(tip->naios_free == naios);

	if (pthread_join(tip->sub_thread, NULL)) {
		fatal("pthread_join", ERR_SYSCALL, "pthread sub join failed\n");
		/*NOTREACHED*/
	}
	if (pthread_join(tip->rec_thread, NULL)) {
		fatal("pthread_join", ERR_SYSCALL, "pthread rec join failed\n");
		/*NOTREACHED*/
	}

	io_destroy(tip->ctx);

	list_splice(&tip->used_iocbs, &tip->free_iocbs);
	list_for_each_safe(p, q, &tip->free_iocbs) {
		struct iocb_pkt *iocbp = list_entry(p, struct iocb_pkt, head);

		list_del(&iocbp->head);
		if (iocbp->nbytes) 
			free(iocbp->iocb.u.c.buf);
		free(iocbp);
	}

	pthread_cond_destroy(&tip->cond);
	pthread_mutex_destroy(&tip->mutex);
}

/**
 * add_input_file - Allocate and initialize per-input file structure
 * @cpu: CPU for this file
 * @devnm: Device name for this file
 * @file_name: Fully qualified input file name
 */
static void add_input_file(int cpu, char *devnm, char *file_name)
{
	struct stat buf;
	struct io_file_hdr hdr;
	struct thr_info *tip = buf_alloc(sizeof(*tip));
	__u64 my_version = mk_btversion(btver_mjr, btver_mnr, btver_sub);

	assert(0 <= cpu && cpu < ncpus);

	memset(&hdr, 0, sizeof(hdr));
	memset(tip, 0, sizeof(*tip));
	tip->cpu = cpu % cpus_to_use;
	tip->iterations = def_iterations;

	tip->ifd = open(file_name, O_RDONLY);
	if (tip->ifd < 0) {
		fatal(file_name, ERR_ARGS, "Unable to open\n");
		/*NOTREACHED*/
	}
	if (fstat(tip->ifd, &buf) < 0) {
		fatal(file_name, ERR_SYSCALL, "fstat failed\n");
		/*NOTREACHED*/
	}
	if (buf.st_size < (off_t)sizeof(hdr)) {
		if (verbose)
			fprintf(stderr, "\t%s empty\n", file_name);
		goto empty_file;
	}

	if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
		fatal(file_name, ERR_ARGS, "Header read failed\n");
		/*NOTREACHED*/
	}

	if (hdr.version != my_version) {
		fprintf(stderr, "%llx %llx %llx %llx\n", 
			(long long unsigned)hdr.version,
			(long long unsigned)hdr.genesis,
			(long long unsigned)hdr.nbunches,
			(long long unsigned)hdr.total_pkts);
		fatal(NULL, ERR_ARGS,
			"BT version mismatch: %llx versus my %llx\n",
			(long long unsigned)hdr.version,
			(long long unsigned)my_version);
	}

	if (hdr.nbunches == 0) {
empty_file:
		close(tip->ifd);
		free(tip);
		return;
	}

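	/*
	 * Track the earliest genesis time across all input files; replayed
	 * bunch time stamps are expressed relative to this value.
	 */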
	if (hdr.genesis < genesis) {
		if (verbose > 1)
			fprintf(stderr, "Setting genesis to %llu.%llu\n",
				du64_to_sec(hdr.genesis),
				du64_to_nsec(hdr.genesis));
		genesis = hdr.genesis;
	}

	tip->devnm = strdup(devnm);
	tip->file_name = strdup(file_name);

	list_add_tail(&tip->head, &input_files);

	if (verbose)
		fprintf(stderr, "Added %s %llu\n", file_name, 
			(long long)hdr.genesis);
}

/**
 * rem_input_file - Release resources associated with an input file
 * @tip: Per-input file information
 */
static void rem_input_file(struct thr_info *tip)
{
	list_del(&tip->head);

	tip_release(tip);

	close(tip->ofd);
	close(tip->ifd);
	free(tip->file_name);
	free(tip->devnm);
	free(tip);
}

/**
 * rem_input_files - Remove all input files
 */
static void rem_input_files(void)
{
	struct list_head *p, *q;

	list_for_each_safe(p, q, &input_files) {
		rem_input_file(list_entry(p, struct thr_info, head));
	}
}

/**
 * __find_input_files - Find input files associated with this device (per cpu)
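 *
 * Input files are expected to be named <idir>/<devnm>.<ibase>.<cpu>
 * (e.g. "./sdb.replay.0" with the defaults); CPU numbers are probed
 * upward from 0 until a file is missing.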
 */
static void __find_input_files(struct dev_info *dip)
{
	int cpu = 0;

	for (;;) {
		char full_name[MAXPATHLEN];

		sprintf(full_name, "%s/%s.%s.%d", idir, dip->devnm, ibase, cpu);
		if (access(full_name, R_OK) != 0)
			break;

		add_input_file(cpu, dip->devnm, full_name);
		cpu++;
	}

	if (!cpu) {
		fatal(NULL, ERR_ARGS, "No traces found for %s\n", dip->devnm);
		/*NOTREACHED*/
	}

	rem_input_dev(dip);
}


/**
 * find_input_files - Find input files for all devices
 */
static void find_input_files(void)
{
	struct list_head *p, *q;

	list_for_each_safe(p, q, &input_devs) {
		__find_input_files(list_entry(p, struct dev_info, head));
	}
}

/* 
 * ========================================================================
 * ==== RECLAIM ROUTINES ==================================================
 * ========================================================================
 */

/**
 * reap_wait_aios - Wait for and return number of outstanding AIOs
 *
 * Will return 0 if we are done
 */
static int reap_wait_aios(struct thr_info *tip)
{
	int naios = 0;

	if (!is_reap_done(tip)) {
		pthread_mutex_lock(&tip->mutex);
		while (tip->naios_out == 0) {
			tip->reap_wait = 1;
			if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
				fatal("pthread_cond_wait", ERR_SYSCALL, 
					"nfree_current cond wait failed\n");
				/*NOTREACHED*/
			}
		}
		naios = tip->naios_out;
		pthread_mutex_unlock(&tip->mutex);
	}
	assert(is_reap_done(tip) || naios > 0);

	return is_reap_done(tip) ? 0 : naios;
}

/**
 * reclaim_ios - Reclaim AIOs completed, recycle IOCBs
 * @tip: Per-thread information
 * @naios_out: Number of AIOs we have outstanding (min)
 */
static void reclaim_ios(struct thr_info *tip, long naios_out)
{
	long i, ndone;
	struct io_event *evp, events[naios_out];

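	/*
	 * Reap at least one completed AIO (retrying on EINTR), verify each
	 * completion, recycle the IOCBs onto the free list and wake the
	 * submit thread if it is waiting for free IOCBs.
	 */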
again:
	assert(naios_out > 0);
	for (;;) {
		ndone = io_getevents(tip->ctx, 1, naios_out, events, NULL);
		if (ndone > 0)
			break;

		if (errno && errno != EINTR) {
			fatal("io_getevents", ERR_SYSCALL, 
				"io_getevents failed\n");
			/*NOTREACHED*/
		}
	}
	assert(0 < ndone && ndone <= naios_out);

	pthread_mutex_lock(&tip->mutex);
	for (i = 0, evp = events; i < ndone; i++, evp++) {
		struct iocb_pkt *iocbp = evp->data;

		if (evp->res != iocbp->iocb.u.c.nbytes) {
			fatal(NULL, ERR_SYSCALL,
			      "Event failure %ld/%ld\t(%ld + %ld)\n",
			      (long)evp->res, (long)evp->res2,
			      (long)iocbp->iocb.u.c.offset / nb_sec,
			      (long)iocbp->iocb.u.c.nbytes / nb_sec);
			/*NOTREACHED*/
		}

		list_move_tail(&iocbp->head, &tip->free_iocbs);
	}

	tip->naios_free += ndone;
	tip->naios_out -= ndone;
	naios_out = minl(naios_out, tip->naios_out);

	if (tip->send_wait) {
		tip->send_wait = 0;
		pthread_cond_signal(&tip->cond);
	}
	pthread_mutex_unlock(&tip->mutex);

	/*
	 * Short cut: If we /know/ there are some more AIOs, go handle them
	 */
	if (naios_out)
		goto again;
}

/**
 * replay_rec - Worker thread to reclaim AIOs
 * @arg: Pointer to thread information
 */
static void *replay_rec(void *arg)
{
	long naios_out;
	struct thr_info *tip = arg;

	while ((naios_out = reap_wait_aios(tip)) > 0) 
		reclaim_ios(tip, naios_out);

	assert(tip->send_done);
	tip->reap_done = 1;
	set_reclaim_done();

	return NULL;
}

/* 
 * ========================================================================
 * ==== REPLAY ROUTINES ===================================================
 * ========================================================================
 */

/**
 * next_bunch - Retrieve next bunch of AIOs to process
 * @tip: Per-thread information
 * @bunch: Bunch information
 *
 * Returns TRUE if we recovered a bunch of IOs, else hit EOF
 */
static int next_bunch(struct thr_info *tip, struct io_bunch *bunch)
{
	size_t count, result;
	
	result = read(tip->ifd, &bunch->hdr, sizeof(bunch->hdr));
	if (result != sizeof(bunch->hdr)) {
		if (result == 0)
			return 0;

		fatal(tip->file_name, ERR_SYSCALL, "Short hdr(%ld)\n", 
			(long)result);
		/*NOTREACHED*/
	}
	assert(bunch->hdr.npkts <= BT_MAX_PKTS);

	count = bunch->hdr.npkts * sizeof(struct io_pkt);
	result = read(tip->ifd, &bunch->pkts, count);
	if (result != count) {
		fatal(tip->file_name, ERR_SYSCALL, "Short pkts(%ld/%ld)\n", 
			(long)result, (long)count);
		/*NOTREACHED*/
	}

	return 1;
}

/**
 * nfree_current - Returns current number of AIOs that are free
 *
 * Will wait for available ones...
 *
 * Returns 0 if we have some condition that causes us to exit
 */
static int nfree_current(struct thr_info *tip)
{
	int nfree = 0;

	pthread_mutex_lock(&tip->mutex);
	while (!is_send_done(tip) && ((nfree = tip->naios_free) == 0)) {
		tip->send_wait = 1;
		if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
			fatal("pthread_cond_wait", ERR_SYSCALL, 
				"nfree_current cond wait failed\n");
			/*NOTREACHED*/
		}
	}
	pthread_mutex_unlock(&tip->mutex);

	return nfree;
}

/**
 * stall - Stall for the number of nanoseconds requested
 *
 * We may be late, in which case we just return.
 */
static void stall(struct thr_info *tip, long long oclock)
{
	struct timespec req;
	long long dreal, tclock = gettime() - rgenesis;

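	/*
	 * Scale the recorded time offset by the acceleration factor, then
	 * sleep in nanosleep() chunks until the replay clock catches up.
	 */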
	oclock /= acc_factor;
	
	if (verbose > 1)
		fprintf(tip->vfp, "   stall(%lld.%09lld, %lld.%09lld)\n",
			du64_to_sec(oclock), du64_to_nsec(oclock),
			du64_to_sec(tclock), du64_to_nsec(tclock));

	while (!is_send_done(tip) && tclock < oclock) {
		dreal = oclock - tclock;
		req.tv_sec = dreal / (1000 * 1000 * 1000);
		req.tv_nsec = dreal % (1000 * 1000 * 1000);

		if (verbose > 1) {
			fprintf(tip->vfp, "++ stall(%lld.%09lld) ++\n",
				(long long)req.tv_sec,
				(long long)req.tv_nsec);
		}

		if (nanosleep(&req, NULL) < 0 && signal_done)
			break;

		tclock = gettime() - rgenesis;
	}
}

/**
 * iocbs_map - Map a set of AIOs onto a set of IOCBs
 * @tip: Per-thread information
 * @list: List of AIOs created
 * @pkts: AIOs to map
 * @ntodo: Number of AIOs to map
 */
static void iocbs_map(struct thr_info *tip, struct iocb **list, 
					     struct io_pkt *pkts, int ntodo)
{
	int i;
	struct io_pkt *pkt;

	assert(0 < ntodo && ntodo <= naios);

	pthread_mutex_lock(&tip->mutex);
	assert(ntodo <= list_len(&tip->free_iocbs));
	for (i = 0, pkt = pkts; i < ntodo; i++, pkt++) {
		__u32 rw = pkt->rw;
		struct iocb_pkt *iocbp;

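		/*
		 * Recorded writes are replayed as reads unless write
		 * replay was explicitly enabled (-W).
		 */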
		if (!pkt->rw && !write_enabled)
			rw = 1;

		if (verbose > 1)
			fprintf(tip->vfp, "\t%10llu + %10llu %c%c\n",
				(unsigned long long)pkt->sector, 
				(unsigned long long)pkt->nbytes / nb_sec,
				rw ? 'R' : 'W', 
				(rw == 1 && pkt->rw == 0) ? '!' : ' ');
		
		iocbp = list_entry(tip->free_iocbs.next, struct iocb_pkt, head);
		iocb_setup(iocbp, rw, pkt->nbytes, pkt->sector * nb_sec);

		list_move_tail(&iocbp->head, &tip->used_iocbs);
		list[i] = &iocbp->iocb;
	}

	tip->naios_free -= ntodo;
	assert(tip->naios_free >= 0);
	pthread_mutex_unlock(&tip->mutex);
}

/**
 * process_bunch - Process a bunch of requests
 * @tip: Per-thread information
 * @bunch: Bunch to process
 */
static void process_bunch(struct thr_info *tip, struct io_bunch *bunch)
{
	__u64 i = 0;
	struct iocb *list[bunch->hdr.npkts];

	assert(0 < bunch->hdr.npkts && bunch->hdr.npkts <= BT_MAX_PKTS);
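	/*
	 * Submit the bunch in chunks: take as many free IOCBs as are
	 * available, optionally stall until the recorded time offset,
	 * then io_submit() the chunk and account for the IOs sent.
	 */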
	while (!is_send_done(tip) && (i < bunch->hdr.npkts)) {
		long ndone;
		int ntodo = min(nfree_current(tip), bunch->hdr.npkts - i);

		assert(0 < ntodo && ntodo <= naios);
		iocbs_map(tip, list, &bunch->pkts[i], ntodo);
		if (!no_stalls)
			stall(tip, bunch->hdr.time_stamp - genesis);

		if (ntodo) {
			if (verbose > 1)
				fprintf(tip->vfp, "submit(%d)\n", ntodo);
			ndone = io_submit(tip->ctx, ntodo, list);
			if (ndone != (long)ntodo) {
				fatal("io_submit", ERR_SYSCALL,
					"%d: io_submit(%d:%ld) failed (%s)\n", 
					tip->cpu, ntodo, ndone, 
					strerror(labs(ndone)));
				/*NOTREACHED*/
			}

			pthread_mutex_lock(&tip->mutex);
			tip->naios_out += ndone;
			assert(tip->naios_out <= naios);
			if (tip->reap_wait) {
				tip->reap_wait = 0;
				pthread_cond_signal(&tip->cond);
			}
			pthread_mutex_unlock(&tip->mutex);

			i += ndone;
			assert(i <= bunch->hdr.npkts);
		}
	}
}

/**
 * reset_input_file - Reset the input file for the next iteration
 * @tip: Thread information
 *
 * We also do a dummy read of the file header to get us to the first bunch.
 */
static void reset_input_file(struct thr_info *tip)
{
	struct io_file_hdr hdr;

	lseek(tip->ifd, 0, SEEK_SET);

	if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
		fatal(tip->file_name, ERR_ARGS, "Header reread failed\n");
		/*NOTREACHED*/
	}
}

/**
 * replay_sub - Worker thread to submit AIOs that are being replayed
 */
static void *replay_sub(void *arg)
{
	unsigned int i;
	char *mdev;
	char path[MAXPATHLEN];
	struct io_bunch bunch;
	struct thr_info *tip = arg;
	int oflags;

	pin_to_cpu(tip);

	mdev = map_dev(tip->devnm);
	sprintf(path, "/dev/%s", mdev);
	/*
	 * convert underscores to slashes to
	 * restore device names that have larger paths
	 */
	for (i = 0; i < strlen(mdev); i++)
	        if (path[strlen("/dev/") + i] == '_')
		        path[strlen("/dev/") + i] = '/';
#ifdef O_NOATIME
	oflags = O_NOATIME;
#else
	oflags = 0;
#endif
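	/*
	 * Open the replay device with O_DIRECT so the AIOs bypass the page
	 * cache and go straight to the device.
	 */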
	tip->ofd = open(path, O_RDWR | O_DIRECT | oflags);
	if (tip->ofd < 0) {
		fatal(path, ERR_SYSCALL, "Failed device open\n");
		/*NOTREACHED*/
	}

	set_replay_ready();
	while (!is_send_done(tip) && tip->iterations--) {
		wait_iter_start();
		if (verbose > 1)
			fprintf(tip->vfp, "\n=== %d ===\n", tip->iterations);
		while (!is_send_done(tip) && next_bunch(tip, &bunch))
			process_bunch(tip, &bunch);
		set_iter_done();
		reset_input_file(tip);
	}
	tip->send_done = 1;
	set_replay_done();

	return NULL;
}

/* 
 * ========================================================================
 * ==== COMMAND LINE ARGUMENT HANDLING ====================================
 * ========================================================================
 */

static char usage_str[] = 						\
        "\n"								\
        "\t[ -c <cpus> : --cpus=<cpus>           ] Default: 1\n"        \
        "\t[ -d <dir>  : --input-directory=<dir> ] Default: .\n"        \
	"\t[ -F        : --find-records          ] Default: Off\n"	\
        "\t[ -h        : --help                  ] Default: Off\n"      \
        "\t[ -i <base> : --input-base=<base>     ] Default: replay\n"   \
        "\t[ -I <iters>: --iterations=<iters>    ] Default: 1\n"        \
        "\t[ -M <file> : --map-devs=<file>       ] Default: None\n"     \
        "\t[ -N        : --no-stalls             ] Default: Off\n"      \
        "\t[ -x        : --acc-factor            ] Default: 1\n"	\
        "\t[ -v        : --verbose               ] Default: Off\n"      \
        "\t[ -V        : --version               ] Default: Off\n"      \
        "\t[ -W        : --write-enable          ] Default: Off\n"      \
        "\t<dev...>                                Default: None\n"     \
        "\n";

#define S_OPTS	"c:d:Fhi:I:M:Nx:vVW"
static struct option l_opts[] = {
	{
		.name = "cpus",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'c'
	},
	{
		.name = "input-directory",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'd'
	},
	{
		.name = "find-records",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'F'
	},
	{
		.name = "help",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'h'
	},
	{
		.name = "input-base",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'i'
	},
	{
		.name = "iterations",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'I'
	},
	{
		.name = "map-devs",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'M'
	},
	{
		.name = "no-stalls",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'N'
	},
	{
		.name = "acc-factor",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'x'
	},
	{
		.name = "verbose",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'v'
	},
	{
		.name = "version",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'V'
	},
	{
		.name = "write-enable",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'W'
	},
	{
		.name = NULL
	}
};

/**
 * handle_args: Parse passed in argument list
 * @argc: Number of arguments in argv
 * @argv: Arguments passed in
 *
 * Does rudimentary parameter verification as well.
 */
static void handle_args(int argc, char *argv[])
{
	int c;
	int r;

	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
		switch (c) {
		case 'c': 
			cpus_to_use = atoi(optarg);
			if (cpus_to_use <= 0 || cpus_to_use > ncpus) {
				fatal(NULL, ERR_ARGS, 
				      "Invalid number of cpus %d (0<x<%d)\n",
				      cpus_to_use, ncpus);
				/*NOTREACHED*/
			}
			break;

		case 'd':
			idir = optarg;
			if (access(idir, R_OK | X_OK) != 0) {
				fatal(idir, ERR_ARGS, 
				      "Invalid input directory specified\n");
				/*NOTREACHED*/
			}
			break;

		case 'F': 
			find_records = 1;
			break;

		case 'h': 
			usage(); 
			exit(0);
			/*NOTREACHED*/

		case 'i': 
			ibase = optarg;
			break;

		case 'I':
			def_iterations = atoi(optarg);
			if (def_iterations <= 0) {
				fprintf(stderr, 
					"Invalid number of iterations %d\n",
					def_iterations);
				exit(ERR_ARGS);
				/*NOTREACHED*/
			}
			break;

		case 'M':
			read_map_devs(optarg);
			break;

		case 'N':
			no_stalls = 1;
			break;

		case 'x':
			r = sscanf(optarg,"%u",&acc_factor);
			if (r!=1) {
				fprintf(stderr,
					"Invalid acceleration factor\n");
				exit(ERR_ARGS);
				/*NOTREACHED*/
			}
			break;

		case 'V':
			fprintf(stderr, "btreplay -- version %s\n", 
				my_btversion);
			exit(0);
			/*NOTREACHED*/

		case 'v':
			verbose++;
			break;

		case 'W':
			write_enabled = 1;
			break;

		default:
			usage();
			fatal(NULL, ERR_ARGS, 
			      "Invalid command line argument %c\n", c);
			/*NOTREACHED*/
		}
	}

	while (optind < argc)
		add_input_dev(argv[optind++]);

	if (find_records)
		find_input_devs(idir);

	if (list_len(&input_devs) == 0) {
		fatal(NULL, ERR_ARGS, "Missing required input dev name(s)\n");
		/*NOTREACHED*/
	}

	if (cpus_to_use < 0)
		cpus_to_use = ncpus;
}

/* 
 * ========================================================================
 * ==== MAIN ROUTINE ======================================================
 * ========================================================================
 */

/**
 * set_signal_done - Signal handler, catches signals & sets signal_done
 */
static void set_signal_done(__attribute__((__unused__))int signum)
{
	signal_done = 1;
}

/**
 * main - Entry point: set up threads, run the requested iterations, clean up
 * @argc: Number of arguments
 * @argv: Array of arguments
 */
int main(int argc, char *argv[])
{
	int i;
	struct list_head *p;

	pgsize = getpagesize();
	assert(pgsize > 0);

	setup_signal(SIGINT, set_signal_done);
	setup_signal(SIGTERM, set_signal_done);

	get_ncpus();
	handle_args(argc, argv);
	find_input_files();

	nfiles = list_len(&input_files);
	__list_for_each(p, &input_files) {
		tip_init(list_entry(p, struct thr_info, head));
	}

	wait_replays_ready();
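	/*
	 * For each iteration: reset our replay start time, release every
	 * replay thread to run one pass, then wait for all of them to
	 * complete that pass.
	 */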
	for (i = 0; i < def_iterations; i++) {
		rgenesis = gettime();
		start_iter();
		if (verbose)
			fprintf(stderr, "I");
		wait_iters_done();
	}

	wait_replays_done();
	wait_reclaims_done();

	if (verbose)
		fprintf(stderr, "\n");

	rem_input_files();
	release_map_devs();

	return 0;
}