/* * Blktrace replay utility - Play traces back * * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <assert.h> #include <errno.h> #include <fcntl.h> #include <libaio.h> #include <pthread.h> #include <sched.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #include <unistd.h> #include <sys/param.h> #include <sys/stat.h> #include <sys/time.h> #include <sys/types.h> #include <dirent.h> #include <stdarg.h> #if !defined(_GNU_SOURCE) # define _GNU_SOURCE #endif #include <getopt.h> #include "list.h" #include "btrecord.h" /* * ======================================================================== * ==== STRUCTURE DEFINITIONS ============================================= * ======================================================================== */ /** * Each device map has one of these: * * @head: Linked on to map_devs * @from_dev: Device name as seen on recorded system * @to_dev: Device name to be used on replay system */ struct map_dev { struct list_head head; char *from_dev, *to_dev; }; /** * Each device name specified has one of these (until threads are created) * * @head: Linked onto input_devs * @devnm: Device name -- 'sd*' */ struct dev_info { struct list_head head; char *devnm; }; /* * Per input file information * * @head: Used to link up on input_files * @free_iocbs: List of free iocb's available for use * @used_iocbs: List of iocb's currently outstanding * @mutex: Mutex used with condition variable to protect volatile values * @cond: Condition variable used when waiting on a volatile value change * @naios_out: Current number of AIOs outstanding on this context * @naios_free: Number of AIOs on the free list (short cut for list_len) * @send_wait: Boolean: When true, the sub thread is waiting on free IOCBs * @reap_wait: Boolean: When true, the rec thread is waiting on used IOCBs * @send_done: Boolean: When true, the sub thread has completed work * @reap_done: Boolean: When true, the rec thread has completed work * @sub_thread: Thread used to submit IOs. * @rec_thread: Thread used to reclaim IOs. * @ctx: IO context * @devnm: Copy of the device name being managed by this thread * @file_name: Full name of the input file * @cpu: CPU this thread is pinned to * @ifd: Input file descriptor * @ofd: Output file descriptor * @iterations: Remaining iterations to process * @vfp: For verbose dumping of actions performed */ struct thr_info { struct list_head head, free_iocbs, used_iocbs; pthread_mutex_t mutex; pthread_cond_t cond; volatile long naios_out, naios_free; volatile int send_wait, reap_wait, send_done, reap_done; pthread_t sub_thread, rec_thread; io_context_t ctx; char *devnm, *file_name; int cpu, ifd, ofd, iterations; FILE *vfp; }; /* * Every Asynchronous IO used has one of these (naios per file/device). * * @iocb: IOCB sent down via io_submit * @head: Linked onto file_list.free_iocbs or file_list.used_iocbs * @tip: Pointer to per-thread information this IO is associated with * @nbytes: Number of bytes in buffer associated with iocb */ struct iocb_pkt { struct iocb iocb; struct list_head head; struct thr_info *tip; int nbytes; }; /* * ======================================================================== * ==== GLOBAL VARIABLES ================================================== * ======================================================================== */ static volatile int signal_done = 0; // Boolean: Signal'ed, need to quit static char *ibase = "replay"; // Input base name static char *idir = "."; // Input directory base static int cpus_to_use = -1; // Number of CPUs to use static int def_iterations = 1; // Default number of iterations static int naios = 512; // Number of AIOs per thread static int ncpus = 0; // Number of CPUs in the system static int verbose = 0; // Boolean: Output some extra info static int write_enabled = 0; // Boolean: Enable writing static __u64 genesis = ~0; // Earliest time seen static __u64 rgenesis; // Our start time static size_t pgsize; // System Page size static int nb_sec = 512; // Number of bytes per sector static LIST_HEAD(input_devs); // List of devices to handle static LIST_HEAD(input_files); // List of input files to handle static LIST_HEAD(map_devs); // List of device maps static int nfiles = 0; // Number of files to handle static int no_stalls = 0; // Boolean: Disable pre-stalls static unsigned acc_factor = 1; // Int: Acceleration factor static int find_records = 0; // Boolean: Find record files auto /* * Variables managed under control of condition variables. * * n_reclaims_done: Counts number of reclaim threads that have completed. * n_replays_done: Counts number of replay threads that have completed. * n_replays_ready: Counts number of replay threads ready to start. * n_iters_done: Counts number of replay threads done one iteration. * iter_start: Starts an iteration for the replay threads. */ static volatile int n_reclaims_done = 0; static pthread_mutex_t reclaim_done_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t reclaim_done_cond = PTHREAD_COND_INITIALIZER; static volatile int n_replays_done = 0; static pthread_mutex_t replay_done_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t replay_done_cond = PTHREAD_COND_INITIALIZER; static volatile int n_replays_ready = 0; static pthread_mutex_t replay_ready_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t replay_ready_cond = PTHREAD_COND_INITIALIZER; static volatile int n_iters_done = 0; static pthread_mutex_t iter_done_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t iter_done_cond = PTHREAD_COND_INITIALIZER; static volatile int iter_start = 0; static pthread_mutex_t iter_start_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t iter_start_cond = PTHREAD_COND_INITIALIZER; /* * ======================================================================== * ==== FORWARD REFERENECES =============================================== * ======================================================================== */ static void *replay_sub(void *arg); static void *replay_rec(void *arg); static char usage_str[]; /* * ======================================================================== * ==== INLINE ROUTINES =================================================== * ======================================================================== */ /* * The 'fatal' macro will output a perror message (if errstring is !NULL) * and display a string (with variable arguments) and then exit with the * specified exit value. */ #define ERR_ARGS 1 #define ERR_SYSCALL 2 static inline void fatal(const char *errstring, const int exitval, const char *fmt, ...) { va_list ap; if (errstring) perror(errstring); va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); exit(exitval); /*NOTREACHED*/ } static inline long long unsigned du64_to_sec(__u64 du64) { return (long long unsigned)du64 / (1000 * 1000 * 1000); } static inline long long unsigned du64_to_nsec(__u64 du64) { return llabs((long long)du64) % (1000 * 1000 * 1000); } /** * min - Return minimum of two integers */ static inline int min(int a, int b) { return a < b ? a : b; } /** * minl - Return minimum of two longs */ static inline long minl(long a, long b) { return a < b ? a : b; } /** * usage - Display usage string and version */ static inline void usage(void) { fprintf(stderr, "Usage: btreplay -- version %s\n%s", my_btversion, usage_str); } /** * is_send_done - Returns true if sender should quit early * @tip: Per-thread information */ static inline int is_send_done(struct thr_info *tip) { return signal_done || tip->send_done; } /** * is_reap_done - Returns true if reaper should quit early * @tip: Per-thread information */ static inline int is_reap_done(struct thr_info *tip) { return signal_done || (tip->send_done && tip->naios_out == 0); } /** * ts2ns - Convert timespec values to a nanosecond value */ #define NS_TICKS ((__u64)1000 * (__u64)1000 * (__u64)1000) static inline __u64 ts2ns(struct timespec *ts) { return ((__u64)(ts->tv_sec) * NS_TICKS) + (__u64)(ts->tv_nsec); } /** * ts2ns - Convert timeval values to a nanosecond value */ static inline __u64 tv2ns(struct timeval *tp) { return ((__u64)(tp->tv_sec)) + ((__u64)(tp->tv_usec) * (__u64)1000); } /** * touch_memory - Force physical memory to be allocating it * * For malloc()ed memory we need to /touch/ it to make it really * exist. Otherwise, for write's (to storage) things may not work * as planned - we see Linux just use a single area to /read/ from * (as there isn't any memory that has been associated with the * allocated virtual addresses yet). */ static inline void touch_memory(char *buf, size_t bsize) { #if defined(PREP_BUFS) memset(buf, 0, bsize); #else size_t i; for (i = 0; i < bsize; i += pgsize) buf[i] = 0; #endif } /** * buf_alloc - Returns a page-aligned buffer of the specified size * @nbytes: Number of bytes to allocate */ static inline void *buf_alloc(size_t nbytes) { void *buf; if (posix_memalign(&buf, pgsize, nbytes)) { fatal("posix_memalign", ERR_SYSCALL, "Allocation failed\n"); /*NOTREACHED*/ } return buf; } /** * gettime - Returns current time */ static inline __u64 gettime(void) { static int use_clock_gettime = -1; // Which clock to use if (use_clock_gettime < 0) { use_clock_gettime = clock_getres(CLOCK_MONOTONIC, NULL) == 0; if (use_clock_gettime) { struct timespec ts = { .tv_sec = 0, .tv_nsec = 0 }; clock_settime(CLOCK_MONOTONIC, &ts); } } if (use_clock_gettime) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts2ns(&ts); } else { struct timeval tp; gettimeofday(&tp, NULL); return tv2ns(&tp); } } /** * setup_signal - Set up a signal handler for the specified signum */ static inline void setup_signal(int signum, sighandler_t handler) { if (signal(signum, handler) == SIG_ERR) { fatal("signal", ERR_SYSCALL, "Failed to set signal %d\n", signum); /*NOTREACHED*/ } } /* * ======================================================================== * ==== CONDITION VARIABLE ROUTINES ======================================= * ======================================================================== */ /** * __set_cv - Increments a variable under condition variable control. * @pmp: Pointer to the associated mutex * @pcp: Pointer to the associated condition variable * @vp: Pointer to the variable being incremented * @mxv: Max value for variable (Used only when ASSERTS are on) */ static inline void __set_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp, volatile int *vp, __attribute__((__unused__))int mxv) { pthread_mutex_lock(pmp); assert(*vp < mxv); *vp += 1; pthread_cond_signal(pcp); pthread_mutex_unlock(pmp); } /** * __wait_cv - Waits for a variable under cond var control to hit a value * @pmp: Pointer to the associated mutex * @pcp: Pointer to the associated condition variable * @vp: Pointer to the variable being incremented * @mxv: Value to wait for */ static inline void __wait_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp, volatile int *vp, int mxv) { pthread_mutex_lock(pmp); while (*vp < mxv) pthread_cond_wait(pcp, pmp); *vp = 0; pthread_mutex_unlock(pmp); } static inline void set_reclaim_done(void) { __set_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done, nfiles); } static inline void wait_reclaims_done(void) { __wait_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done, nfiles); } static inline void set_replay_ready(void) { __set_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready, nfiles); } static inline void wait_replays_ready(void) { __wait_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready, nfiles); } static inline void set_replay_done(void) { __set_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done, nfiles); } static inline void wait_replays_done(void) { __wait_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done, nfiles); } static inline void set_iter_done(void) { __set_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done, nfiles); } static inline void wait_iters_done(void) { __wait_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done, nfiles); } /** * wait_iter_start - Wait for an iteration to start * * This is /slightly/ different: we are waiting for a value to become * non-zero, and then we decrement it and go on. */ static inline void wait_iter_start(void) { pthread_mutex_lock(&iter_start_mutex); while (iter_start == 0) pthread_cond_wait(&iter_start_cond, &iter_start_mutex); assert(1 <= iter_start && iter_start <= nfiles); iter_start--; pthread_mutex_unlock(&iter_start_mutex); } /** * start_iter - Start an iteration at the replay thread level */ static inline void start_iter(void) { pthread_mutex_lock(&iter_start_mutex); assert(iter_start == 0); iter_start = nfiles; pthread_cond_broadcast(&iter_start_cond); pthread_mutex_unlock(&iter_start_mutex); } /* * ======================================================================== * ==== CPU RELATED ROUTINES ============================================== * ======================================================================== */ /** * get_ncpus - Sets up the global 'ncpus' value */ static void get_ncpus(void) { #ifdef _SC_NPROCESSORS_ONLN ncpus = sysconf(_SC_NPROCESSORS_ONLN); #else int nrcpus = 4096; cpu_set_t * cpus; realloc: cpus = CPU_ALLOC(nrcpus); size = CPU_ALLOC_SIZE(nrcpus); CPU_ZERO_S(size, cpus); if (sched_getaffinity(0, size, cpus)) { if( errno == EINVAL && nrcpus < (4096<<4) ) { CPU_FREE(cpus); nrcpus <<= 1; goto realloc; } fatal("sched_getaffinity", ERR_SYSCALL, "Can't get CPU info\n"); /*NOTREACHED*/ } ncpus = -1; for (last_cpu = 0; last_cpu < CPU_SETSIZE && CPU_ISSET(last_cpu, &cpus); last_cpu++) if (CPU_ISSET( last_cpu, &cpus) ) ncpus = last_cpu; ncpus++; CPU_FREE(cpus); #endif if (ncpus == 0) { fatal(NULL, ERR_SYSCALL, "Insufficient number of CPUs\n"); /*NOTREACHED*/ } } /** * pin_to_cpu - Pin this thread to a specific CPU * @tip: Thread information */ static void pin_to_cpu(struct thr_info *tip) { cpu_set_t *cpus; size_t size; cpus = CPU_ALLOC(ncpus); size = CPU_ALLOC_SIZE(ncpus); assert(0 <= tip->cpu && tip->cpu < ncpus); CPU_ZERO_S(size, cpus); CPU_SET_S(tip->cpu, size, cpus); if (sched_setaffinity(0, size, cpus)) { fatal("sched_setaffinity", ERR_SYSCALL, "Failed to pin CPU\n"); /*NOTREACHED*/ } assert(tip->cpu == sched_getcpu()); if (verbose > 1) { int i; cpu_set_t *now = CPU_ALLOC(ncpus); (void)sched_getaffinity(0, size, now); fprintf(tip->vfp, "Pinned to CPU %02d ", tip->cpu); for (i = 0; i < ncpus; i++) fprintf(tip->vfp, "%1d", CPU_ISSET_S(i, size, now)); fprintf(tip->vfp, "\n"); } } /* * ======================================================================== * ==== INPUT DEVICE HANDLERS ============================================= * ======================================================================== */ /** * add_input_dev - Add a device ('sd*') to the list of devices to handle */ static void add_input_dev(char *devnm) { struct list_head *p; struct dev_info *dip; __list_for_each(p, &input_devs) { dip = list_entry(p, struct dev_info, head); if (strcmp(dip->devnm, devnm) == 0) return; } dip = malloc(sizeof(*dip)); dip->devnm = strdup(devnm); list_add_tail(&dip->head, &input_devs); } /** * rem_input_dev - Remove resources associated with this device */ static void rem_input_dev(struct dev_info *dip) { list_del(&dip->head); free(dip->devnm); free(dip); } static void find_input_devs(char *idir) { struct dirent *ent; DIR *dir = opendir(idir); if (dir == NULL) { fatal(idir, ERR_ARGS, "Unable to open %s\n", idir); /*NOTREACHED*/ } while ((ent = readdir(dir)) != NULL) { char *p, *dsf; if (strstr(ent->d_name, ".replay.") == NULL) continue; dsf = strdup(ent->d_name); p = index(dsf, '.'); assert(p != NULL); *p = '\0'; add_input_dev(dsf); free(dsf); } closedir(dir); } /* * ======================================================================== * ==== MAP DEVICE INTERFACES ============================================= * ======================================================================== */ /** * read_map_devs - Read in a set of device mapping from the provided file. * @file_name: File containing device maps * * We support the notion of multiple such files being specifed on the cmd line */ static void read_map_devs(char *file_name) { FILE *fp; char from_dev[256], to_dev[256]; fp = fopen(file_name, "r"); if (!fp) { fatal(file_name, ERR_SYSCALL, "Could not open map devs file\n"); /*NOTREACHED*/ } while (fscanf(fp, "%s %s", from_dev, to_dev) == 2) { struct map_dev *mdp = malloc(sizeof(*mdp)); mdp->from_dev = from_dev; mdp->to_dev = to_dev; list_add_tail(&mdp->head, &map_devs); } fclose(fp); } /** * release_map_devs - Release resources associated with device mappings. */ static void release_map_devs(void) { struct list_head *p, *q; list_for_each_safe(p, q, &map_devs) { struct map_dev *mdp = list_entry(p, struct map_dev, head); list_del(&mdp->head); free(mdp->from_dev); free(mdp->to_dev); free(mdp); } } /** * map_dev - Return the mapped device for that specified * @from_dev: Device name as seen on recorded system * * Note: If there is no such mapping, we return the same name. */ static char *map_dev(char *from_dev) { struct list_head *p; __list_for_each(p, &map_devs) { struct map_dev *mdp = list_entry(p, struct map_dev, head); if (strcmp(from_dev, mdp->from_dev) == 0) return mdp->to_dev; } return from_dev; } /* * ======================================================================== * ==== IOCB MANAGEMENT ROUTINES ========================================== * ======================================================================== */ /** * iocb_init - Initialize the fields of an IOCB * @tip: Per-thread information * iocbp: IOCB pointer to update */ static void iocb_init(struct thr_info *tip, struct iocb_pkt *iocbp) { iocbp->tip = tip; iocbp->nbytes = 0; iocbp->iocb.u.c.buf = NULL; } /** * iocb_setup - Set up an iocb with this AIOs information * @iocbp: IOCB pointer to update * @rw: Direction (0 == write, 1 == read) * @n: Number of bytes to transfer * @off: Offset (in bytes) */ static void iocb_setup(struct iocb_pkt *iocbp, int rw, int n, long long off) { char *buf; struct iocb *iop = &iocbp->iocb; assert(rw == 0 || rw == 1); assert(0 < n && (n % nb_sec) == 0); assert(0 <= off); if (iocbp->nbytes) { if (iocbp->nbytes >= n) { buf = iop->u.c.buf; goto prep; } assert(iop->u.c.buf); free(iop->u.c.buf); } buf = buf_alloc(n); iocbp->nbytes = n; prep: if (rw) io_prep_pread(iop, iocbp->tip->ofd, buf, n, off); else { assert(write_enabled); io_prep_pwrite(iop, iocbp->tip->ofd, buf, n, off); touch_memory(buf, n); } iop->data = iocbp; } /* * ======================================================================== * ==== PER-THREAD SET UP & TEAR DOWN ===================================== * ======================================================================== */ /** * tip_init - Per thread initialization function */ static void tip_init(struct thr_info *tip) { int i; INIT_LIST_HEAD(&tip->free_iocbs); INIT_LIST_HEAD(&tip->used_iocbs); pthread_mutex_init(&tip->mutex, NULL); pthread_cond_init(&tip->cond, NULL); if (io_setup(naios, &tip->ctx)) { fatal("io_setup", ERR_SYSCALL, "io_setup failed\n"); /*NOTREACHED*/ } tip->ofd = -1; tip->naios_out = 0; tip->send_done = tip->reap_done = 0; tip->send_wait = tip->reap_wait = 0; memset(&tip->sub_thread, 0, sizeof(tip->sub_thread)); memset(&tip->rec_thread, 0, sizeof(tip->rec_thread)); for (i = 0; i < naios; i++) { struct iocb_pkt *iocbp = buf_alloc(sizeof(*iocbp)); iocb_init(tip, iocbp); list_add_tail(&iocbp->head, &tip->free_iocbs); } tip->naios_free = naios; if (verbose > 1) { char fn[MAXPATHLEN]; sprintf(fn, "%s/%s.%s.%d.rep", idir, tip->devnm, ibase, tip->cpu); tip->vfp = fopen(fn, "w"); if (!tip->vfp) { fatal(fn, ERR_SYSCALL, "Failed to open report\n"); /*NOTREACHED*/ } setlinebuf(tip->vfp); } if (pthread_create(&tip->sub_thread, NULL, replay_sub, tip)) { fatal("pthread_create", ERR_SYSCALL, "thread create failed\n"); /*NOTREACHED*/ } if (pthread_create(&tip->rec_thread, NULL, replay_rec, tip)) { fatal("pthread_create", ERR_SYSCALL, "thread create failed\n"); /*NOTREACHED*/ } } /** * tip_release - Release resources associated with this thread */ static void tip_release(struct thr_info *tip) { struct list_head *p, *q; assert(tip->send_done); assert(tip->reap_done); assert(list_len(&tip->used_iocbs) == 0); assert(tip->naios_free == naios); if (pthread_join(tip->sub_thread, NULL)) { fatal("pthread_join", ERR_SYSCALL, "pthread sub join failed\n"); /*NOTREACHED*/ } if (pthread_join(tip->rec_thread, NULL)) { fatal("pthread_join", ERR_SYSCALL, "pthread rec join failed\n"); /*NOTREACHED*/ } io_destroy(tip->ctx); list_splice(&tip->used_iocbs, &tip->free_iocbs); list_for_each_safe(p, q, &tip->free_iocbs) { struct iocb_pkt *iocbp = list_entry(p, struct iocb_pkt, head); list_del(&iocbp->head); if (iocbp->nbytes) free(iocbp->iocb.u.c.buf); free(iocbp); } pthread_cond_destroy(&tip->cond); pthread_mutex_destroy(&tip->mutex); } /** * add_input_file - Allocate and initialize per-input file structure * @cpu: CPU for this file * @devnm: Device name for this file * @file_name: Fully qualifed input file name */ static void add_input_file(int cpu, char *devnm, char *file_name) { struct stat buf; struct io_file_hdr hdr; struct thr_info *tip = buf_alloc(sizeof(*tip)); __u64 my_version = mk_btversion(btver_mjr, btver_mnr, btver_sub); assert(0 <= cpu && cpu < ncpus); memset(&hdr, 0, sizeof(hdr)); memset(tip, 0, sizeof(*tip)); tip->cpu = cpu % cpus_to_use; tip->iterations = def_iterations; tip->ifd = open(file_name, O_RDONLY); if (tip->ifd < 0) { fatal(file_name, ERR_ARGS, "Unable to open\n"); /*NOTREACHED*/ } if (fstat(tip->ifd, &buf) < 0) { fatal(file_name, ERR_SYSCALL, "fstat failed\n"); /*NOTREACHED*/ } if (buf.st_size < (off_t)sizeof(hdr)) { if (verbose) fprintf(stderr, "\t%s empty\n", file_name); goto empty_file; } if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) { fatal(file_name, ERR_ARGS, "Header read failed\n"); /*NOTREACHED*/ } if (hdr.version != my_version) { fprintf(stderr, "%llx %llx %llx %llx\n", (long long unsigned)hdr.version, (long long unsigned)hdr.genesis, (long long unsigned)hdr.nbunches, (long long unsigned)hdr.total_pkts); fatal(NULL, ERR_ARGS, "BT version mismatch: %lx versus my %lx\n", (long)hdr.version, (long)my_version); } if (hdr.nbunches == 0) { empty_file: close(tip->ifd); free(tip); return; } if (hdr.genesis < genesis) { if (verbose > 1) fprintf(stderr, "Setting genesis to %llu.%llu\n", du64_to_sec(hdr.genesis), du64_to_nsec(hdr.genesis)); genesis = hdr.genesis; } tip->devnm = strdup(devnm); tip->file_name = strdup(file_name); list_add_tail(&tip->head, &input_files); if (verbose) fprintf(stderr, "Added %s %llu\n", file_name, (long long)hdr.genesis); } /** * rem_input_file - Release resources associated with an input file * @tip: Per-input file information */ static void rem_input_file(struct thr_info *tip) { list_del(&tip->head); tip_release(tip); close(tip->ofd); close(tip->ifd); free(tip->file_name); free(tip->devnm); free(tip); } /** * rem_input_files - Remove all input files */ static void rem_input_files(void) { struct list_head *p, *q; list_for_each_safe(p, q, &input_files) { rem_input_file(list_entry(p, struct thr_info, head)); } } /** * __find_input_files - Find input files associated with this device (per cpu) */ static void __find_input_files(struct dev_info *dip) { int cpu = 0; for (;;) { char full_name[MAXPATHLEN]; sprintf(full_name, "%s/%s.%s.%d", idir, dip->devnm, ibase, cpu); if (access(full_name, R_OK) != 0) break; add_input_file(cpu, dip->devnm, full_name); cpu++; } if (!cpu) { fatal(NULL, ERR_ARGS, "No traces found for %s\n", dip->devnm); /*NOTREACHED*/ } rem_input_dev(dip); } /** * find_input_files - Find input files for all devices */ static void find_input_files(void) { struct list_head *p, *q; list_for_each_safe(p, q, &input_devs) { __find_input_files(list_entry(p, struct dev_info, head)); } } /* * ======================================================================== * ==== RECLAIM ROUTINES ================================================== * ======================================================================== */ /** * reap_wait_aios - Wait for and return number of outstanding AIOs * * Will return 0 if we are done */ static int reap_wait_aios(struct thr_info *tip) { int naios = 0; if (!is_reap_done(tip)) { pthread_mutex_lock(&tip->mutex); while (tip->naios_out == 0) { tip->reap_wait = 1; if (pthread_cond_wait(&tip->cond, &tip->mutex)) { fatal("pthread_cond_wait", ERR_SYSCALL, "nfree_current cond wait failed\n"); /*NOTREACHED*/ } } naios = tip->naios_out; pthread_mutex_unlock(&tip->mutex); } assert(is_reap_done(tip) || naios > 0); return is_reap_done(tip) ? 0 : naios; } /** * reclaim_ios - Reclaim AIOs completed, recycle IOCBs * @tip: Per-thread information * @naios_out: Number of AIOs we have outstanding (min) */ static void reclaim_ios(struct thr_info *tip, long naios_out) { long i, ndone; struct io_event *evp, events[naios_out]; again: assert(naios > 0); for (;;) { ndone = io_getevents(tip->ctx, 1, naios_out, events, NULL); if (ndone > 0) break; if (errno && errno != EINTR) { fatal("io_getevents", ERR_SYSCALL, "io_getevents failed\n"); /*NOTREACHED*/ } } assert(0 < ndone && ndone <= naios_out); pthread_mutex_lock(&tip->mutex); for (i = 0, evp = events; i < ndone; i++, evp++) { struct iocb_pkt *iocbp = evp->data; if (evp->res != iocbp->iocb.u.c.nbytes) { fatal(NULL, ERR_SYSCALL, "Event failure %ld/%ld\t(%ld + %ld)\n", (long)evp->res, (long)evp->res2, (long)iocbp->iocb.u.c.offset / nb_sec, (long)iocbp->iocb.u.c.nbytes / nb_sec); /*NOTREACHED*/ } list_move_tail(&iocbp->head, &tip->free_iocbs); } tip->naios_free += ndone; tip->naios_out -= ndone; naios_out = minl(naios_out, tip->naios_out); if (tip->send_wait) { tip->send_wait = 0; pthread_cond_signal(&tip->cond); } pthread_mutex_unlock(&tip->mutex); /* * Short cut: If we /know/ there are some more AIOs, go handle them */ if (naios_out) goto again; } /** * replay_rec - Worker thread to reclaim AIOs * @arg: Pointer to thread information */ static void *replay_rec(void *arg) { long naios_out; struct thr_info *tip = arg; while ((naios_out = reap_wait_aios(tip)) > 0) reclaim_ios(tip, naios_out); assert(tip->send_done); tip->reap_done = 1; set_reclaim_done(); return NULL; } /* * ======================================================================== * ==== REPLAY ROUTINES =================================================== * ======================================================================== */ /** * next_bunch - Retrieve next bunch of AIOs to process * @tip: Per-thread information * @bunch: Bunch information * * Returns TRUE if we recovered a bunch of IOs, else hit EOF */ static int next_bunch(struct thr_info *tip, struct io_bunch *bunch) { size_t count, result; result = read(tip->ifd, &bunch->hdr, sizeof(bunch->hdr)); if (result != sizeof(bunch->hdr)) { if (result == 0) return 0; fatal(tip->file_name, ERR_SYSCALL, "Short hdr(%ld)\n", (long)result); /*NOTREACHED*/ } assert(bunch->hdr.npkts <= BT_MAX_PKTS); count = bunch->hdr.npkts * sizeof(struct io_pkt); result = read(tip->ifd, &bunch->pkts, count); if (result != count) { fatal(tip->file_name, ERR_SYSCALL, "Short pkts(%ld/%ld)\n", (long)result, (long)count); /*NOTREACHED*/ } return 1; } /** * nfree_current - Returns current number of AIOs that are free * * Will wait for available ones... * * Returns 0 if we have some condition that causes us to exit */ static int nfree_current(struct thr_info *tip) { int nfree = 0; pthread_mutex_lock(&tip->mutex); while (!is_send_done(tip) && ((nfree = tip->naios_free) == 0)) { tip->send_wait = 1; if (pthread_cond_wait(&tip->cond, &tip->mutex)) { fatal("pthread_cond_wait", ERR_SYSCALL, "nfree_current cond wait failed\n"); /*NOTREACHED*/ } } pthread_mutex_unlock(&tip->mutex); return nfree; } /** * stall - Stall for the number of nanoseconds requested * * We may be late, in which case we just return. */ static void stall(struct thr_info *tip, long long oclock) { struct timespec req; long long dreal, tclock = gettime() - rgenesis; oclock /= acc_factor; if (verbose > 1) fprintf(tip->vfp, " stall(%lld.%09lld, %lld.%09lld)\n", du64_to_sec(oclock), du64_to_nsec(oclock), du64_to_sec(tclock), du64_to_nsec(tclock)); while (!is_send_done(tip) && tclock < oclock) { dreal = oclock - tclock; req.tv_sec = dreal / (1000 * 1000 * 1000); req.tv_nsec = dreal % (1000 * 1000 * 1000); if (verbose > 1) { fprintf(tip->vfp, "++ stall(%lld.%09lld) ++\n", (long long)req.tv_sec, (long long)req.tv_nsec); } if (nanosleep(&req, NULL) < 0 && signal_done) break; tclock = gettime() - rgenesis; } } /** * iocbs_map - Map a set of AIOs onto a set of IOCBs * @tip: Per-thread information * @list: List of AIOs created * @pkts: AIOs to map * @ntodo: Number of AIOs to map */ static void iocbs_map(struct thr_info *tip, struct iocb **list, struct io_pkt *pkts, int ntodo) { int i; struct io_pkt *pkt; assert(0 < ntodo && ntodo <= naios); pthread_mutex_lock(&tip->mutex); assert(ntodo <= list_len(&tip->free_iocbs)); for (i = 0, pkt = pkts; i < ntodo; i++, pkt++) { __u32 rw = pkt->rw; struct iocb_pkt *iocbp; if (!pkt->rw && !write_enabled) rw = 1; if (verbose > 1) fprintf(tip->vfp, "\t%10llu + %10llu %c%c\n", (unsigned long long)pkt->sector, (unsigned long long)pkt->nbytes / nb_sec, rw ? 'R' : 'W', (rw == 1 && pkt->rw == 0) ? '!' : ' '); iocbp = list_entry(tip->free_iocbs.next, struct iocb_pkt, head); iocb_setup(iocbp, rw, pkt->nbytes, pkt->sector * nb_sec); list_move_tail(&iocbp->head, &tip->used_iocbs); list[i] = &iocbp->iocb; } tip->naios_free -= ntodo; assert(tip->naios_free >= 0); pthread_mutex_unlock(&tip->mutex); } /** * process_bunch - Process a bunch of requests * @tip: Per-thread information * @bunch: Bunch to process */ static void process_bunch(struct thr_info *tip, struct io_bunch *bunch) { __u64 i = 0; struct iocb *list[bunch->hdr.npkts]; assert(0 < bunch->hdr.npkts && bunch->hdr.npkts <= BT_MAX_PKTS); while (!is_send_done(tip) && (i < bunch->hdr.npkts)) { long ndone; int ntodo = min(nfree_current(tip), bunch->hdr.npkts - i); assert(0 < ntodo && ntodo <= naios); iocbs_map(tip, list, &bunch->pkts[i], ntodo); if (!no_stalls) stall(tip, bunch->hdr.time_stamp - genesis); if (ntodo) { if (verbose > 1) fprintf(tip->vfp, "submit(%d)\n", ntodo); ndone = io_submit(tip->ctx, ntodo, list); if (ndone != (long)ntodo) { fatal("io_submit", ERR_SYSCALL, "%d: io_submit(%d:%ld) failed (%s)\n", tip->cpu, ntodo, ndone, strerror(labs(ndone))); /*NOTREACHED*/ } pthread_mutex_lock(&tip->mutex); tip->naios_out += ndone; assert(tip->naios_out <= naios); if (tip->reap_wait) { tip->reap_wait = 0; pthread_cond_signal(&tip->cond); } pthread_mutex_unlock(&tip->mutex); i += ndone; assert(i <= bunch->hdr.npkts); } } } /** * reset_input_file - Reset the input file for the next iteration * @tip: Thread information * * We also do a dummy read of the file header to get us to the first bunch. */ static void reset_input_file(struct thr_info *tip) { struct io_file_hdr hdr; lseek(tip->ifd, 0, 0); if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) { fatal(tip->file_name, ERR_ARGS, "Header reread failed\n"); /*NOTREACHED*/ } } /** * replay_sub - Worker thread to submit AIOs that are being replayed */ static void *replay_sub(void *arg) { unsigned int i; char *mdev; char path[MAXPATHLEN]; struct io_bunch bunch; struct thr_info *tip = arg; int oflags; pin_to_cpu(tip); mdev = map_dev(tip->devnm); sprintf(path, "/dev/%s", mdev); /* * convert underscores to slashes to * restore device names that have larger paths */ for (i = 0; i < strlen(mdev); i++) if (path[strlen("/dev/") + i] == '_') path[strlen("/dev/") + i] = '/'; #ifdef O_NOATIME oflags = O_NOATIME; #else oflags = 0; #endif tip->ofd = open(path, O_RDWR | O_DIRECT | oflags); if (tip->ofd < 0) { fatal(path, ERR_SYSCALL, "Failed device open\n"); /*NOTREACHED*/ } set_replay_ready(); while (!is_send_done(tip) && tip->iterations--) { wait_iter_start(); if (verbose > 1) fprintf(tip->vfp, "\n=== %d ===\n", tip->iterations); while (!is_send_done(tip) && next_bunch(tip, &bunch)) process_bunch(tip, &bunch); set_iter_done(); reset_input_file(tip); } tip->send_done = 1; set_replay_done(); return NULL; } /* * ======================================================================== * ==== COMMAND LINE ARGUMENT HANDLING ==================================== * ======================================================================== */ static char usage_str[] = \ "\n" \ "\t[ -c <cpus> : --cpus=<cpus> ] Default: 1\n" \ "\t[ -d <dir> : --input-directory=<dir> ] Default: .\n" \ "\t[ -F : --find-records ] Default: Off\n" \ "\t[ -h : --help ] Default: Off\n" \ "\t[ -i <base> : --input-base=<base> ] Default: replay\n" \ "\t[ -I <iters>: --iterations=<iters> ] Default: 1\n" \ "\t[ -M <file> : --map-devs=<file> ] Default: None\n" \ "\t[ -N : --no-stalls ] Default: Off\n" \ "\t[ -x : --acc-factor ] Default: 1\n" \ "\t[ -v : --verbose ] Default: Off\n" \ "\t[ -V : --version ] Default: Off\n" \ "\t[ -W : --write-enable ] Default: Off\n" \ "\t<dev...> Default: None\n" \ "\n"; #define S_OPTS "c:d:Fhi:I:M:Nx:t:vVW" static struct option l_opts[] = { { .name = "cpus", .has_arg = required_argument, .flag = NULL, .val = 'c' }, { .name = "input-directory", .has_arg = required_argument, .flag = NULL, .val = 'd' }, { .name = "find-records", .has_arg = no_argument, .flag = NULL, .val = 'F' }, { .name = "help", .has_arg = no_argument, .flag = NULL, .val = 'h' }, { .name = "input-base", .has_arg = required_argument, .flag = NULL, .val = 'i' }, { .name = "iterations", .has_arg = required_argument, .flag = NULL, .val = 'I' }, { .name = "map-devs", .has_arg = required_argument, .flag = NULL, .val = 'M' }, { .name = "no-stalls", .has_arg = no_argument, .flag = NULL, .val = 'N' }, { .name = "acc-factor", .has_arg = required_argument, .flag = NULL, .val = 'x' }, { .name = "verbose", .has_arg = no_argument, .flag = NULL, .val = 'v' }, { .name = "version", .has_arg = no_argument, .flag = NULL, .val = 'V' }, { .name = "write-enable", .has_arg = no_argument, .flag = NULL, .val = 'W' }, { .name = NULL } }; /** * handle_args: Parse passed in argument list * @argc: Number of arguments in argv * @argv: Arguments passed in * * Does rudimentary parameter verification as well. */ static void handle_args(int argc, char *argv[]) { int c; int r; while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) { switch (c) { case 'c': cpus_to_use = atoi(optarg); if (cpus_to_use <= 0 || cpus_to_use > ncpus) { fatal(NULL, ERR_ARGS, "Invalid number of cpus %d (0<x<%d)\n", cpus_to_use, ncpus); /*NOTREACHED*/ } break; case 'd': idir = optarg; if (access(idir, R_OK | X_OK) != 0) { fatal(idir, ERR_ARGS, "Invalid input directory specified\n"); /*NOTREACHED*/ } break; case 'F': find_records = 1; break; case 'h': usage(); exit(0); /*NOTREACHED*/ case 'i': ibase = optarg; break; case 'I': def_iterations = atoi(optarg); if (def_iterations <= 0) { fprintf(stderr, "Invalid number of iterations %d\n", def_iterations); exit(ERR_ARGS); /*NOTREACHED*/ } break; case 'M': read_map_devs(optarg); break; case 'N': no_stalls = 1; break; case 'x': r = sscanf(optarg,"%u",&acc_factor); if (r!=1) { fprintf(stderr, "Invalid acceleration factor\n"); exit(ERR_ARGS); /*NOTREACHED*/ } break; case 'V': fprintf(stderr, "btreplay -- version %s\n", my_btversion); exit(0); /*NOTREACHED*/ case 'v': verbose++; break; case 'W': write_enabled = 1; break; default: usage(); fatal(NULL, ERR_ARGS, "Invalid command line argument %c\n", c); /*NOTREACHED*/ } } while (optind < argc) add_input_dev(argv[optind++]); if (find_records) find_input_devs(idir); if (list_len(&input_devs) == 0) { fatal(NULL, ERR_ARGS, "Missing required input dev name(s)\n"); /*NOTREACHED*/ } if (cpus_to_use < 0) cpus_to_use = ncpus; } /* * ======================================================================== * ==== MAIN ROUTINE ====================================================== * ======================================================================== */ /** * set_signal_done - Signal handler, catches signals & sets signal_done */ static void set_signal_done(__attribute__((__unused__))int signum) { signal_done = 1; } /** * main - * @argc: Number of arguments * @argv: Array of arguments */ int main(int argc, char *argv[]) { int i; struct list_head *p; pgsize = getpagesize(); assert(pgsize > 0); setup_signal(SIGINT, set_signal_done); setup_signal(SIGTERM, set_signal_done); get_ncpus(); handle_args(argc, argv); find_input_files(); nfiles = list_len(&input_files); __list_for_each(p, &input_files) { tip_init(list_entry(p, struct thr_info, head)); } wait_replays_ready(); for (i = 0; i < def_iterations; i++) { rgenesis = gettime(); start_iter(); if (verbose) fprintf(stderr, "I"); wait_iters_done(); } wait_replays_done(); wait_reclaims_done(); if (verbose) fprintf(stderr, "\n"); rem_input_files(); release_map_devs(); return 0; }