/* Identification string for this source file: it names the file, the
   copyright years and the version.
   NOTE(review): the leading "@(#)" looks like the marker that the
   what(1) utility searches for - confirm. */
char   netcpu_looper_id[]="\
@(#)netcpu_looper.c (c) Copyright 2005-2012. Version 2.6.0";

/* netcpu_looper.c

   Implement the soaker process specific portions of netperf CPU
   utilization measurements. These are broken-out into a separate file
   to make life much nicer over in netlib.c which had become a maze of
   twisty, CPU-util-related, #ifdefs, all different.  raj 2005-01-26
   */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>

#ifdef HAVE_FCNTL_H
# include <fcntl.h>
#endif
#if HAVE_UNISTD_H
# include <unistd.h>
#endif
#if defined(HAVE_MMAP) || defined(HAVE_SYS_MMAN_H)
# include <sys/mman.h>
#else
# error netcpu_looper requires mmap
#endif

#if TIME_WITH_SYS_TIME
# include <sys/time.h>
# include <time.h>
#else
# if HAVE_SYS_TIME_H
#  include <sys/time.h>
# else
#  include <time.h>
# endif
#endif

#if HAVE_SYS_TYPES_H
# include <sys/types.h>
#endif

#if HAVE_SYS_WAIT_H
# include <sys/wait.h>
#endif

#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif

#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif

#include "netsh.h"
#include "netlib.h"

/* number of pages of the shared counter region set aside for each
   looper; every looper's counter sits at the start of its own
   PAGES_PER_CHILD-page slice */
#define PAGES_PER_CHILD 2

/* the lib_start_count and lib_end_count arrays hold the starting
   and ending values of whatever is counting when the system is
   idle. The rate at which this increments during a test is compared
   with a previous calibration to arrive at a CPU utilization
   percentage. raj 2005-01-26 */
static uint64_t  lib_start_count[MAXCPUS];
static uint64_t  lib_end_count[MAXCPUS];

/* not referenced by any of the code reviewed here */
static int *cpu_mappings;

/* descriptor returned by open() on /tmp/netperf_cpu in
   start_looper_processes (non-WIN32 builds only) */
static int lib_idle_fd;
/* lib_idle_address[i] points at the counter incremented by looper i */
static uint64_t *lib_idle_address[MAXCPUS];
/* start of the counter region: the mmap()ed file, or a malloc()ed
   buffer when compiled -DWIN32 */
static long     *lib_base_pointer;
/* what fork() (or CreateThread() under WIN32) returned for looper i */
static pid_t     lib_idle_pids[MAXCPUS];
/* non-zero once cpu_util_init has started the loopers; cleared again
   by cpu_util_terminate */
static int       lib_loopers_running=0;

/* we used to use this code to bind the loopers, but since we have
   decided to enable processor affinity for the actual
   netperf/netserver processes we will use that affinity routine,
   which happens to know about more systems than this */

#ifdef NOTDEF
/* bind_to_processor - kept for reference only: it is compiled solely
   when NOTDEF is defined, and sit_and_spin calls
   bind_to_specific_processor instead.

   child_num is the index of the calling looper; the CPU requested is
   always (child_num % lib_num_loc_cpus). Returns nothing. A failed
   bind is reported (Solaris and WIN32 branches) but is not fatal. On
   platforms other than HP-UX, Solaris and WIN32 the body is a
   no-op. */
static void
bind_to_processor(int child_num)
{
  /* This routine will bind the calling process to a particular */
  /* processor. We are not choosy as to which processor, so it will be */
  /* the process id mod the number of processors - shifted by one for */
  /* those systems which name processor starting from one instead of */
  /* zero. on those systems where I do not yet know how to bind a */
  /* process to a processor, this routine will be a no-op raj 10/95 */

  /* just as a reminder, this is *only* for the looper processes, not */
  /* the actual measurement processes. those will, should, MUST float */
  /* or not float from CPU to CPU as controlled by the operating */
  /* system defaults. raj 12/95 */

#ifdef __hpux
  /* note that these headers are pulled in from inside the function
     body */
#include <sys/syscall.h>
#include <sys/mp.h>

  /* never read or written again in this branch */
  int old_cpu = -2;

  if (debug) {
    fprintf(where,
            "child %d asking for CPU %d as pid %d with %d CPUs\n",
            child_num,
            (child_num % lib_num_loc_cpus),
            getpid(),
            lib_num_loc_cpus);
    fflush(where);
  }

  /* the result of SETPROCESS is not checked */
  SETPROCESS((child_num % lib_num_loc_cpus), getpid());
  return;

#else
#if defined(__sun) && defined(__SVR4)
 /* should only be Solaris */
#include <sys/processor.h>
#include <sys/procset.h>

  /* handed to processor_bind() as its last argument; never read here */
  int old_binding;

  if (debug) {
    fprintf(where,
            "bind_to_processor: child %d asking for CPU %d as pid %d with %d CPUs\n",
            child_num,
            (child_num % lib_num_loc_cpus),
            getpid(),
            lib_num_loc_cpus);
    fflush(where);
  }

  /* a failed bind is only reported; the looper carries on unbound */
  if (processor_bind(P_PID,
                     getpid(),
                     (child_num % lib_num_loc_cpus),
                      &old_binding) != 0) {
    fprintf(where,"bind_to_processor: unable to perform processor binding\n");
    fprintf(where,"                   errno %d\n",errno);
    fflush(where);
  }
  return;
#else
#ifdef WIN32

  /* the affinity mask has a single bit set, at position
     (child_num % lib_num_loc_cpus) */
  if (!SetThreadAffinityMask(GetCurrentThread(), (ULONG_PTR)1 << (child_num % lib_num_loc_cpus))) {
    perror("SetThreadAffinityMask failed");
    fflush(stderr);
  }

  if (debug) {
    fprintf(where,
            "bind_to_processor: child %d asking for CPU %d of %d CPUs\n",
            child_num,
            (child_num % lib_num_loc_cpus),
            lib_num_loc_cpus);
    fflush(where);
  }

#endif /* WIN32 */
  return;
#endif /* __sun && __SVR4 */
#endif /* __hpux */
}
#endif /* NOTDEF */

 /* sit_and_spin will just spin about incrementing a value */
 /* this value will either be in a memory mapped region on Unix shared */
 /* by each looper process, or something appropriate on Windows/NT */
 /* (malloc'd or such). This routine is reasonably ugly in that it has */
 /* priority manipulating code for lots of different operating */
 /* systems. This routine never returns. raj 1/96 */
 /* */
 /* child_index selects both the counter slot this looper increments */
 /* (PAGES_PER_CHILD pages per looper, measured from lib_base_pointer) */
 /* and, modulo lib_num_loc_cpus, the first argument handed to */
 /* bind_to_specific_processor. */

static void
sit_and_spin(int child_index)

{
  uint64_t *my_counter_ptr;

 /* only use C stuff if we are not WIN32 unless and until we */
 /* switch from CreateThread to _beginthread. raj 1/96 */
#ifndef WIN32
  /* we are the child. we could decide to exec some separate */
  /* program, but that doesn't really seem worthwhile - raj 4/95 */
  if (debug > 1) {
    fprintf(where,
            "Looper child %d is born, pid %d\n",
            child_index,
            getpid());
    fflush(where);
  }

#endif /* WIN32 */

  /* reset our base pointer to be at the appropriate offset */
  my_counter_ptr = (uint64_t *) ((char *)lib_base_pointer +
                             (netlib_get_page_size() *
                              PAGES_PER_CHILD * child_index));

  /* in the event we are running on an MP system, it would */
  /* probably be good to bind the soaker processes to specific */
  /* processors. I *think* this is the most reasonable thing to */
  /* do, and would be closest to simulating the information we get */
  /* on HP-UX with pstat. I could put all the system-specific code */
  /* here, but will "abstract it into another routine to keep this */
  /* area more readable. I'll probably do the same thing with the */
  /* "low pri code" raj 10/95 */

  /* since we are "flying blind" wrt where we should bind the looper
     processes, we want to use the cpu_map that was prepared by netlib
     rather than assume that the CPU ids on the system start at zero
     and are contiguous. raj 2006-04-03 */
  /* NOTE(review): the meaning of the second argument (1) is not
     visible from this file - presumably it asks for the index to be
     translated through the cpu map; confirm against netlib. */
  bind_to_specific_processor(child_index % lib_num_loc_cpus,1);

  /* the loop has no termination condition: the counter is zeroed once
     and then incremented forever */
  for (*my_counter_ptr = 0L;
       ;
       (*my_counter_ptr)++) {
    /* NOTE(review): any integer modulo 1 is 0, so this test is always
       true and the priority call below runs on every pass, not just
       "every once and again". It also reads the value at
       lib_base_pointer (the start of the region) rather than
       *my_counter_ptr. Changing either would change the counting rate,
       so confirm the intended interval before touching it. */
    if (!(*lib_base_pointer % 1)) {
      /* every once and again, make sure that our process priority is */
      /* nice and low. also, by making system calls, it may be easier */
      /* for us to be pre-empted by something that needs to do useful */
      /* work - like the thread of execution actually sending and */
      /* receiving data across the network :) */
#ifdef _AIX
      int pid,prio;

      /* PRIORITY is not defined in this file */
      prio = PRIORITY;
      pid = getpid();
      /* if you are not root, this call will return EPERM - why one */
      /* cannot change one's own priority to  lower value is beyond */
      /* me. raj 2/26/96 */
      setpri(pid, prio);
#else /* _AIX */
#ifdef __sgi
      int pid,prio;

      /* PRIORITY is not defined in this file */
      prio = PRIORITY;
      pid = getpid();
      schedctl(NDPRI, pid, prio);
      sginap(0);
#else /* __sgi */
#ifdef WIN32
      SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_IDLE);
#else /* WIN32 */
#if defined(__sun) && defined(__SVR4)
      /* note that these headers are pulled in from inside the loop
         body */
#include <sys/types.h>
#include <sys/priocntl.h>
#include <sys/rtpriocntl.h>
#include <sys/tspriocntl.h>
      /* I would *really* like to know how to use priocntl to make the */
      /* priority low for this looper process. however, either my mind */
      /* is addled, or the manpage in section two for priocntl is not */
      /* terribly helpful - for one, it has no examples :( so, if you */
      /* can help, I'd love to hear from you. in the meantime, we will */
      /* rely on nice(39). raj 2/26/96 */
      nice(39);
#else /* __sun && __SVR4 */
      /* every remaining platform just uses nice() */
      nice(39);
#endif /* __sun && __SVR4 */
#endif /* WIN32 */
#endif /* __sgi */
#endif /* _AIX */
    }
  }
}



 /* start_looper_processes: create the shared counter region and start
    one looper (a forked process, or a thread when compiled -DWIN32)
    for each of the lib_num_loc_cpus local CPUs.

    On non-WIN32 builds the region is a MAP_SHARED mapping of
    /tmp/netperf_cpu. The file is created with O_EXCL, so a file left
    over from an earlier run makes the open fail. Any setup failure is
    reported (on `where`, or via perror for fork) and ends the process
    with exit(1); once this routine has created the backing file it
    removes it again before exiting, and a fork failure also
    terminates the loopers already started.

    On success lib_base_pointer, lib_idle_pids[] and lib_idle_address[]
    are filled in, and the routine sleeps 30 seconds before returning
    so the loopers can settle. Takes no arguments, returns nothing. */

static void
start_looper_processes(void)
{

  unsigned int      i, file_size;

  /* we want at least two pages for each processor. the */
  /* child for any one processor will write to the first of his two */
  /* pages, and the second page will be a buffer in case there is page */
  /* prefetching. if your system pre-fetches more than a single page, */
  /* well, you'll have to modify this or live with it :( raj 4/95 */

  file_size = ((netlib_get_page_size() * PAGES_PER_CHILD) *
               lib_num_loc_cpus);

#ifndef WIN32

  /* if we are not using WINDOWS NT (or 95 actually :), then we want */
  /* to create a memory mapped region so we can see all the counting */
  /* rates of the loopers */

  /* could we just use an anonymous memory region for this? it is */
  /* possible that using a mmap()'ed "real" file, while convenient for */
  /* debugging, could result in some filesystem activity - like */
  /* metadata updates? raj 4/96 */

  /* O_CREAT requires the third (mode) argument; without it the */
  /* permission bits of the new file come from an indeterminate value */
  lib_idle_fd = open("/tmp/netperf_cpu",O_RDWR | O_CREAT | O_EXCL,0644);

  if (lib_idle_fd == -1) {
    fprintf(where,"create_looper: file creation; errno %d\n",errno);
    fflush(where);
    exit(1);
  }

  /* the mode passed to open() is filtered through the umask, so the */
  /* permissions are still set explicitly */
  if (chmod("/tmp/netperf_cpu",0644) == -1) {
    fprintf(where,"create_looper: chmod; errno %d\n",errno);
    fflush(where);
    /* we created the file; do not leave it behind to trip O_EXCL on */
    /* the next run */
    unlink("/tmp/netperf_cpu");
    exit(1);
  }

  /* with the file descriptor in place, lets be sure that the file is */
  /* large enough. */

  if (truncate("/tmp/netperf_cpu",file_size) == -1) {
    fprintf(where,"create_looper: truncate: errno %d\n",errno);
    fflush(where);
    unlink("/tmp/netperf_cpu");
    exit(1);
  }

  /* the file should be large enough now, so we can mmap it */

  /* if the system does not have MAP_VARIABLE, just define it to */
  /* be zero. it is only used/needed on HP-UX (?) raj 4/95 */
#ifndef MAP_VARIABLE
#define MAP_VARIABLE 0x0000
#endif /* MAP_VARIABLE */
#ifndef MAP_FILE
#define MAP_FILE 0x0000
#endif /* MAP_FILE */
  /* mmap() reports failure with MAP_FAILED; supply the traditional */
  /* value on systems whose <sys/mman.h> does not define it */
#ifndef MAP_FAILED
#define MAP_FAILED ((void *)-1)
#endif /* MAP_FAILED */
  lib_base_pointer = (long *)mmap(NULL,
                                  file_size,
                                  PROT_READ | PROT_WRITE,
                                  MAP_FILE | MAP_SHARED | MAP_VARIABLE,
                                  lib_idle_fd,
                                  0);
  if ((void *)lib_base_pointer == MAP_FAILED) {
    fprintf(where,"create_looper: mmap: errno %d\n",errno);
    fflush(where);
    unlink("/tmp/netperf_cpu");
    exit(1);
  }


  if (debug > 1) {
    /* file_size is unsigned, and %p wants a void pointer */
    fprintf(where,"num CPUs %d, file_size %u, lib_base_pointer %p\n",
            lib_num_loc_cpus,
            file_size,
            (void *)lib_base_pointer);
    fflush(where);
  }

  /* we should have a valid base pointer. lets fork */

  for (i = 0; i < (unsigned int)lib_num_loc_cpus; i++) {
    switch (lib_idle_pids[i] = fork()) {
    case -1:
      perror("netperf: fork");
      /* do not leave the loopers started so far spinning, nor the */
      /* backing file lying around */
      while (i-- > 0) {
        kill(lib_idle_pids[i],SIGTERM);
      }
      unlink("/tmp/netperf_cpu");
      exit(1);
    case 0:
      /* we are the child. we could decide to exec some separate */
      /* program, but that doesn't really seem worthwhile - raj 4/95 */

      /* make sure the SIGTERM sent by cpu_util_terminate ends us */
      signal(SIGTERM, SIG_DFL);
      sit_and_spin(i);

      /* we should never really get here, but if we do, just exit(0) */
      exit(0);
      break;
    default:
      /* we must be the parent */
      lib_idle_address[i] = (uint64_t *) ((char *)lib_base_pointer +
                                      (netlib_get_page_size() *
                                       PAGES_PER_CHILD * i));
      if (debug) {
        fprintf(where,"lib_idle_address[%u] is %p\n",
                i,
                (void *)lib_idle_address[i]);
        fflush(where);
      }
    }
  }
#else
  /* we are compiled -DWIN32 */
  if ((lib_base_pointer = malloc(file_size)) == NULL) {
    fprintf(where,
            "create_looper_process could not malloc %u bytes\n",
            file_size);
    fflush(where);
    exit(1);
  }

  /* now, create all the threads */
  for(i = 0; i < (unsigned int)lib_num_loc_cpus; i++) {
    long place_holder;
    if ((lib_idle_pids[i] = CreateThread(0,
                                         0,
                                         (LPTHREAD_START_ROUTINE)sit_and_spin,
                                         (LPVOID)(ULONG_PTR)i,
                                         0,
                                         &place_holder)) == NULL ) {
      fprintf(where,
              "create_looper_process: CreateThread failed\n");
      fflush(where);
      /* I wonder if I need to look for other threads to kill? */
      exit(1);
    }
    /* lib_idle_address[] holds uint64_t pointers, as in the fork path */
    lib_idle_address[i] = (uint64_t *) ((char *)lib_base_pointer +
                                        (netlib_get_page_size() *
                                         PAGES_PER_CHILD * i));
    if (debug) {
      fprintf(where,"lib_idle_address[%u] is %p\n",
              i,
              (void *)lib_idle_address[i]);
      fflush(where);
    }
  }
#endif /* WIN32 */

  /* we need to have the looper processes settled-in before we do */
  /* anything with them, so lets sleep for say 30 seconds. raj 4/95 */

  sleep(30);
}

/* cpu_util_init: select the LOOPER method of CPU utilization
   measurement and make sure the looper processes exist. The loopers
   are started at most once; later calls only re-assert cpu_method. */
void
cpu_util_init(void)
{
  cpu_method = LOOPER;

  if (lib_loopers_running) {
    /* the loopers are already going - nothing more to do */
    return;
  }

  start_looper_processes();
  lib_loopers_running = 1;
}

/* clean-up any left-over CPU util resources - looper processes,
   files, whatever.  raj 2005-01-26 */
void
cpu_util_terminate() {

#ifdef WIN32
  /* it would seem that if/when the process exits, all the threads */
  /* will go away too, so I don't think I need any explicit thread */
  /* killing calls here. raj 1/96 */
#else

  int i;

  /* now go through and kill-off all the child processes */
  for (i = 0; i < lib_num_loc_cpus; i++){
    /* SIGKILL can leave core files behind - thanks to Steinar Haug */
    /* for pointing that out. */
    kill(lib_idle_pids[i],SIGTERM);
  }
  lib_loopers_running = 0;
  /* reap the children */
  while(waitpid(-1, NULL, WNOHANG) > 0) { }

  /* finally, unlink the mmaped file */
  munmap((caddr_t)lib_base_pointer,
         ((netlib_get_page_size() * PAGES_PER_CHILD) *
          lib_num_loc_cpus));
  unlink("/tmp/netperf_cpu");
#endif
  return;
}

/* get_cpu_method: report the CPU utilization mechanism implemented by
   this file, which is always LOOPER. */
int
get_cpu_method(void)
{
  const int method = LOOPER;

  return method;
}

 /* calibrate_idle_rate (the old calibrate_looper) */

 /* Loop a number of iterations, sleeping interval seconds each and */
 /* count how high the idle counters get each time. Return the */
 /* measured cpu rate to the calling routine. raj 4/95 */
 /* */
 /* iterations: number of samples to take, clamped to MAXTIMES. */
 /* interval:   seconds to sleep for each sample. */
 /* returns:    the largest counts-per-second rate seen on any CPU in */
 /*             any sample, or -1.0 when no sample could be taken */
 /*             (iterations <= 0, or no measurable time elapsed). */

float
calibrate_idle_rate (int iterations, int interval)
{

  uint64_t
    firstcnt[MAXCPUS],
    secondcnt[MAXCPUS];

  float
    elapsed,
    temp_rate,
    rate[MAXTIMES],
    local_maxrate;

  long
    sec,
    usec;

  int
    i,
    j;

  struct  timeval time1, time2 ;

  if (iterations > MAXTIMES) {
    iterations = MAXTIMES;
  }

  local_maxrate = (float)-1.0;

  for(i = 0; i < iterations; i++) {
    rate[i] = (float)0.0;
    for (j = 0; j < lib_num_loc_cpus; j++) {
      firstcnt[j] = *(lib_idle_address[j]);
    }
    /* the timezone argument is obsolete; NULL is the portable value */
    gettimeofday (&time1, NULL);
    sleep(interval);
    gettimeofday (&time2, NULL);

    /* borrow a second so the microsecond difference is not negative */
    if (time2.tv_usec < time1.tv_usec)
      {
        time2.tv_usec += 1000000;
        time2.tv_sec -=1;
      }
    sec = time2.tv_sec - time1.tv_sec;
    usec = time2.tv_usec - time1.tv_usec;
    elapsed = (float)sec + ((float)usec/(float)1000000.0);

    if(debug) {
      fprintf(where, "Calibration for counter run: %d\n",i);
      fprintf(where,"\tsec = %ld usec = %ld\n",sec,usec);
      fprintf(where,"\telapsed time = %g\n",elapsed);
    }

    /* with no measurable elapsed time a rate cannot be computed, and */
    /* dividing anyway would hand back inf or NaN as the maximum rate */
    if (elapsed <= (float)0.0) {
      continue;
    }

    for (j = 0; j < lib_num_loc_cpus; j++) {
      secondcnt[j] = *(lib_idle_address[j]);
      if(debug) {
        /* I know that there are situations where compilers know about */
        /* long long, but the library functions do not... raj 4/95 */
        /* each half is converted to unsigned long so that the */
        /* arguments really match the %lx conversions */
        fprintf(where,
                "\tfirstcnt[%d] = 0x%8.8lx%8.8lx secondcnt[%d] = 0x%8.8lx%8.8lx\n",
                j,
                (unsigned long)(firstcnt[j]>>32),
                (unsigned long)(firstcnt[j]&0xffffffff),
                j,
                (unsigned long)(secondcnt[j]>>32),
                (unsigned long)(secondcnt[j]&0xffffffff));
      }
      /* we assume that it would wrap no more than once. the counters */
      /* are unsigned 64-bit values, so the subtraction is performed */
      /* modulo 2^64 and already yields the right count across a */
      /* single wrap - no additional correction is wanted */
      temp_rate = (float)(secondcnt[j] - firstcnt[j])/elapsed;
      if (temp_rate > rate[i]) rate[i] = temp_rate;
      if(debug) {
        fprintf(where,"\trate[%d] = %g\n",i,rate[i]);
        fflush(where);
      }
      if (local_maxrate < rate[i]) local_maxrate = rate[i];
    }
  }
  if(debug) {
    fprintf(where,"\tlocal maxrate = %g per sec. \n",local_maxrate);
    fflush(where);
  }
  return local_maxrate;
}


/* get_cpu_idle: copy the current value of every looper's idle counter
   into res[0 .. lib_num_loc_cpus-1]. res must have room for that many
   entries. */
static void
get_cpu_idle (uint64_t *res)
{
  int cpu = 0;

  while (cpu < lib_num_loc_cpus) {
    const uint64_t *counter = lib_idle_address[cpu];

    res[cpu] = *counter;
    cpu++;
  }
}

/* calc_cpu_util_internal: turn the idle counts captured by
   cpu_start_internal and cpu_stop_internal into CPU utilization.

   For every local CPU the idle-counter rate over lib_elapsed is
   compared with the calibrated lib_local_maxrate; the shortfall,
   as a percentage, is stored in lib_local_per_cpu_util[]. The values
   are scaled by lib_elapsed / elapsed_time when the caller supplies a
   non-zero elapsed_time.

   elapsed_time: the test's own idea of its duration in seconds, or
                 0.0 to use the library's measurement unchanged.
   returns:      the average of the per-CPU percentages, which is also
                 left in lib_local_cpu_stats.cpu_util (the rest of
                 lib_local_cpu_stats is zeroed). */
float
calc_cpu_util_internal(float elapsed_time)
{
  int i;
  float correction_factor;
  float actual_rate;

  memset(&lib_local_cpu_stats, 0, sizeof(lib_local_cpu_stats));

  /* It is possible that the library measured a time other than */
  /* the one that the user want for the cpu utilization */
  /* calculations - for example, tests that were ended by */
  /* watchdog timers such as the udp stream test. We let these */
  /* tests tell up what the elapsed time should be. */

  if (elapsed_time != 0.0) {
    correction_factor = (float) 1.0 +
      ((lib_elapsed - elapsed_time) / elapsed_time);
  }
  else {
    correction_factor = (float) 1.0;
  }

  for (i = 0; i < lib_num_loc_cpus; i++) {

    /* it would appear that on some systems, in loopback, nice is
     *very* effective, causing the looper process to stop dead in its
     tracks. if this happens, we need to ensure that the calculation
     does not go south. raj 6/95 and if we run completely out of idle,
     the same thing could in theory happen to the USE_KSTAT path. raj
     8/2000 */

    if (lib_end_count[i] == lib_start_count[i]) {
      lib_end_count[i]++;
    }

    /* the counters are unsigned 64-bit values, so the subtraction is
       performed modulo 2^64 and is already correct across a single
       wrap - no additional correction is wanted */
    actual_rate =
      (float)(lib_end_count[i] - lib_start_count[i]) / lib_elapsed;
    if (debug) {
      /* each half is converted to unsigned long so that the arguments
         really match the %lx conversions */
      fprintf(where,
              "calc_cpu_util: actual_rate on processor %d is %f start 0x%8.8lx%8.8lx end 0x%8.8lx%8.8lx\n",
              i,
              actual_rate,
              (unsigned long)(lib_start_count[i]>>32),
              (unsigned long)(lib_start_count[i]&0xffffffff),
              (unsigned long)(lib_end_count[i]>>32),
              (unsigned long)(lib_end_count[i]&0xffffffff));
    }
    lib_local_per_cpu_util[i] = (lib_local_maxrate - actual_rate) /
      lib_local_maxrate * 100;
    lib_local_per_cpu_util[i] *= correction_factor;
    lib_local_cpu_stats.cpu_util += lib_local_per_cpu_util[i];
  }
  /* we want the average across all n processors */
  lib_local_cpu_stats.cpu_util /= (float)lib_num_loc_cpus;

  return lib_local_cpu_stats.cpu_util;
}

/* cpu_start_internal: record the current idle counters in
   lib_start_count[] as the starting point of a measurement. */
void
cpu_start_internal(void)
{
  uint64_t *snapshot = lib_start_count;

  get_cpu_idle(snapshot);
}

/* cpu_stop_internal: record the current idle counters in
   lib_end_count[] as the end point of a measurement. */
void
cpu_stop_internal(void)
{
  uint64_t *snapshot = lib_end_count;

  get_cpu_idle(snapshot);
  return;
}