/**
 * @file op_pmu.c
 * Setup and handling of IA64 Performance Monitoring Unit (PMU)
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author Bob Montgomery
 * @author Will Cohen
 * @author John Levon
 * @author Philippe Elie
 */


#include "oprofile.h"
#include "op_util.h"
#include <asm/perfmon.h>
#include "op_ia64_model.h"

/* number of counters physically present */
static uint op_nr_counters = 4;

/* performance counters are in pairs: pmcN and pmdN.  The pmc register acts
 * as the event selection; the pmd register is the counter. */
#define perf_reg(c)	((c)+4)

#define IA64_1_PMD_MASK_VAL	((1UL << 32) - 1)
#define IA64_2_PMD_MASK_VAL	((1UL << 47) - 1)
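
/* Itanium pmds implement 32 counting bits, Itanium 2 (McKinley)
 * implements 47; hence the two mask values above. */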

/* The appropriate value is selected in pmu_init() */
unsigned long pmd_mask = IA64_2_PMD_MASK_VAL;

#define pmd_overflowed(r, c) ((r) & (1 << perf_reg(c)))
#define set_pmd_neg(v, c) do { \
	ia64_set_pmd(perf_reg(c), -(ulong)(v) & pmd_mask); \
	ia64_srlz_d(); } while (0)
#define set_pmd(v, c) do { \
	ia64_set_pmd(perf_reg(c), (v) & pmd_mask); \
	ia64_srlz_d(); } while (0)
#define set_pmc(v, c) do { ia64_set_pmc(perf_reg(c), (v)); ia64_srlz_d(); } while (0)
#define get_pmd(c) ia64_get_pmd(perf_reg(c))
#define get_pmc(c) ia64_get_pmc(perf_reg(c))
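
/* Counters count upwards, so set_pmd_neg() loads a pmd with the
 * two's-complement of the reset count: e.g. a count of 100000 on
 * Itanium 2 is loaded as (2^47 - 100000), so the counter overflows
 * and raises the PMU interrupt after 100000 more events. */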

/* ---------------- IRQ handler ------------------ */

/* The args match the args for pfm_overflow_handler in perfmon.c.
 * The task_struct is currently filled in with the perfmon "owner" of
 * the PMU.  This might change.  I'm not sure it makes sense in perfmon
 * either for system-wide profiling.
 * pmc0 is a bit mask of the overflowed counters (bits 4-7).
 * The caller clears pmc0 afterwards to unfreeze the PMU and resume
 * counting.
 */
static inline void
op_do_pmu_interrupt(u64 pmc0, struct pt_regs * regs)
{
	uint cpu = op_cpu_id();
	int ctr;

	for (ctr = 0 ; ctr < op_nr_counters ; ++ctr) {
		if (pmd_overflowed(pmc0, ctr)) {
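			/* record a sample at the interrupted IP, then rearm
			 * the pmd with its negated reset count */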
			op_do_profile(cpu, regs->cr_iip, 1, ctr);
			set_pmd_neg(oprof_data[cpu].ctr_count[ctr], ctr);
		}
	}
	return;
}


static void
op_raw_pmu_interrupt(int irq, void * arg, struct pt_regs * regs)
{
	u64 pmc0;

	pmc0 = ia64_get_pmc(0);
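	/* pmc0 bit 0 is the freeze bit; the remaining bits flag overflowed
	 * counters.  Writing 0 to pmc0 below unfreezes the PMU. */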

	if ((pmc0 & ~0x1UL) != 0UL) {
		op_do_pmu_interrupt(pmc0, regs);
		ia64_set_pmc(0, 0);
		ia64_srlz_d();
	}
}


#define MY_OPROFILE_VECTOR (IA64_PERFMON_VECTOR - 2)
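/* This vector is distinct from IA64_PERFMON_VECTOR so that, while
 * profiling is active, PMU overflow interrupts are delivered to this
 * module's handler rather than to perfmon's. */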

static void
op_set_pmv(void * dummy)
{
	ia64_set_pmv(MY_OPROFILE_VECTOR);
	ia64_srlz_d();
}


static void
op_restore_pmv(void* dummy)
{
	ia64_set_pmv(IA64_PERFMON_VECTOR);
	ia64_srlz_d();
}


static int
install_handler(void)
{
	int err = 0;

	/* Request the vector through the regular IRQ interface (there is
	 * some confusion between vector and irq numbers on ia64). */
	err = request_irq(MY_OPROFILE_VECTOR, op_raw_pmu_interrupt, 
			SA_INTERRUPT | SA_PERCPU_IRQ, "oprofile", NULL);

	if (err) {
		printk(KERN_ALERT "oprofile_IA64: request_irq fails, "
				"returns %d\n", err);
		return err;
	}

	if ((smp_call_function(op_set_pmv, NULL, 0, 1))) {
		printk(KERN_ALERT "oprofile_IA64: unexpected failure "
				"of smp_call_function(op_set_pmv)\n");
	}

	op_set_pmv(NULL);

	return err;
}


static int
restore_handler(void)
{
	int err = 0;

	if ((smp_call_function(op_restore_pmv, NULL, 0, 1))) {
		printk(KERN_ALERT "oprofile_IA64: unexpected failure "
				"of smp_call_function(op_restore_pmv)\n");
	}

	op_restore_pmv(NULL);

	free_irq(MY_OPROFILE_VECTOR, NULL);
	return err;
}


/* ---------------- PMU setup ------------------ */

/* This is somewhat artificial.  The proc interface might really want to
 * accept register values directly.  There are other features not exposed
 * by this limited interface, though exposing them would require more
 * validity checking. */
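/* pmc<4..7> layout as programmed by pmc_fill_in() below:
 *   bit 0       count at privilege level 0 (kernel)
 *   bit 3       count at privilege level 3 (user)
 *   bit 5       overflow interrupt enable
 *   bit 6       privileged monitor
 *   bits 8-15   event select
 *   bits 16-19  unit mask
 *   bit 23      PMU enable bit required by McKinley (see comment below)
 */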
static void
pmc_fill_in(ulong * val, u8 kernel, u8 user, u8 event, u8 um)
{
	/* enable interrupt generation */
	*val |= (1 << 5);

	/* setup as a privileged monitor */
	*val |= (1 << 6);

	/* McKinley requires pmc4 to have bit 23 set (enable PMU).
	 * It is supposedly ignored in other pmc registers.
	 * Try assuming it's ignored in Itanium, too, and just
	 * set it for everyone.
	 */

	*val |= (1 << 23);

	/* enable/disable chosen OS and USR counting */
	if (user)
		*val |= (1 << 3);
	else
		*val &= ~(1 << 3);

	if (kernel)
		*val |= (1 << 0);
	else
		*val &= ~(1 << 0);

	/* what are we counting? */
	*val &= ~(0xff << 8);
	*val |= ((event & 0xff) << 8);
	*val &= ~(0xf << 16);
	*val |= ((um & 0xf) << 16);
}


static void
pmu_setup(void * dummy)
{
	ulong pmc_val;
	int ii;

	/* setup each counter */
	for (ii = 0 ; ii < op_nr_counters ; ++ii) {
		if (sysctl.ctr[ii].enabled) {
			pmc_val = 0;

			set_pmd_neg(sysctl.ctr[ii].count, ii);
			pmc_fill_in(&pmc_val, sysctl.ctr[ii].kernel, 
				sysctl.ctr[ii].user, sysctl.ctr[ii].event, 
				sysctl.ctr[ii].unit_mask);

			set_pmc(pmc_val, ii);
		}
	}
}


void 
disable_psr(void * dummy)
{
	struct pt_regs * regs;
	/* disable profiling for my saved state */
	regs = (struct pt_regs *)((unsigned long) current + IA64_STK_OFFSET);
	regs--;
	ia64_psr(regs)->pp = 0;
	/* shouldn't need to */
	ia64_psr(regs)->up = 0;

	/* disable profiling for my current state */
	__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");

#if defined(CONFIG_PERFMON) && defined(CONFIG_SMP)
#if V_AT_LEAST(2, 4, 21)
	local_cpu_data->pfm_syst_info |=  PFM_CPUINFO_SYST_WIDE;
	local_cpu_data->pfm_syst_info &= ~PFM_CPUINFO_DCR_PP;
	/* FIXME: what to do with the third flag, PFM_CPUINFO_EXCL_IDLE (0x4)? */
#else
	/* disable profiling for everyone else */
	local_cpu_data->pfm_syst_wide = 1;
	local_cpu_data->pfm_dcr_pp = 0;
#endif
#endif
	ia64_set_pmc(0, 0);
	ia64_srlz_d();
}


static int
pmu_setup_all(void)
{

	/* This would be a great place to reserve all cpus with
	 * some sort of call to perfmonctl (something like the
	 * CREATE_CONTEXT command).  The current interface to
	 * perfmonctl wants to be called from a different task id
	 * for each CPU to be set up (and doesn't allow calls from
	 * modules).
	 */

	/* disable profiling with the psr.pp bit */
	if ((smp_call_function(disable_psr, NULL, 0, 1)))
		return -EFAULT;

	disable_psr(NULL);

	/* now I've reserved the PMUs and they should be quiet */

	if ((smp_call_function(pmu_setup, NULL, 0, 1)))
		return -EFAULT;

	pmu_setup(NULL);
	return 0;
}


#ifndef CONFIG_SMP
/* from linux/arch/ia64/kernel/perfmon.c */
/*
 * Originally Written by Ganesh Venkitachalam, IBM Corp.
 * Copyright (C) 1999 Ganesh Venkitachalam <venkitac@us.ibm.com>
 *
 * Modifications by Stephane Eranian, Hewlett-Packard Co.
 * Modifications by David Mosberger-Tang, Hewlett-Packard Co.
 *
 * Copyright (C) 1999-2002  Hewlett Packard Co
 *               Stephane Eranian <eranian@hpl.hp.com>
 *               David Mosberger-Tang <davidm@hpl.hp.com>
 */

/*
 * On UP kernels, we do not need to constantly set the psr.pp bit
 * when a task is scheduled.  The psr.pp bit can only be changed in
 * the kernel because of a user request.  Given we are on a UP
 * non-preemptive kernel, we know that no other task is running, so we
 * can simply update each task's psr.pp from its saved state.  There is
 * thus no impact on the context switch code compared to the SMP case.
 */
static void
op_tasklist_toggle_pp(unsigned int val)
{
	struct task_struct * p;
	struct pt_regs * regs;

	read_lock(&tasklist_lock);

	for_each_task(p) {
		regs = (struct pt_regs *)((unsigned long) p + IA64_STK_OFFSET);

		/*
		 * position on pt_regs saved on stack on 1st entry into the kernel
		 */
		regs--;

		/*
		 * update psr.pp
		 */
		ia64_psr(regs)->pp = val;
	}
	read_unlock(&tasklist_lock);
}
#endif


static void
pmu_start(void * info)
{
	struct pt_regs * regs;

	if (info && (*((uint *)info) != op_cpu_id()))
		return;

	/* printk(KERN_ALERT "oprofile_IA64: pmu_start on cpu %d\n", 
	  	op_cpu_id()); */
	/* The default control register pp value is copied into psr.pp
	 * on an interrupt.  This allows interrupt service routines to
	 * be monitored.
	 */
	ia64_set_dcr(ia64_get_dcr() | IA64_DCR_PP);

#ifdef CONFIG_PERFMON
#ifdef CONFIG_SMP
#if V_AT_LEAST(2, 4, 21)
	local_cpu_data->pfm_syst_info |= PFM_CPUINFO_SYST_WIDE;
	local_cpu_data->pfm_syst_info |= PFM_CPUINFO_DCR_PP;
	/* FIXME: what to do with the third flag, PFM_CPUINFO_EXCL_IDLE (0x4)? */
#else
	local_cpu_data->pfm_syst_wide = 1;
	local_cpu_data->pfm_dcr_pp = 1;
#endif
#else
	op_tasklist_toggle_pp(1);
#endif
#endif
	/* set it in my saved state */
	regs = (struct pt_regs *)((unsigned long) current + IA64_STK_OFFSET);
	regs--;
	ia64_psr(regs)->pp = 1;

	/* set it in my current state */
	__asm__ __volatile__ ("ssm psr.pp;;"::: "memory");
	ia64_srlz_d();
}


static void
pmu_stop(void * info)
{
	struct pt_regs * regs;

	if (info && (*((uint *)info) != op_cpu_id()))
		return;

	/* stop in my current state */
	__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");

	/* disable the dcr pp */
	ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);

#ifdef CONFIG_PERFMON
#ifdef CONFIG_SMP
#if V_AT_LEAST(2, 4, 21)
	local_cpu_data->pfm_syst_info &= ~PFM_CPUINFO_SYST_WIDE;
	local_cpu_data->pfm_syst_info &= ~PFM_CPUINFO_DCR_PP;
	/* FIXME: what to do with the third flag, PFM_CPUINFO_EXCL_IDLE (0x4)? */
#else
	local_cpu_data->pfm_syst_wide = 0;
	local_cpu_data->pfm_dcr_pp = 0;
#endif
#else
	op_tasklist_toggle_pp(0);
#endif
#endif

	/* disable in my saved state */
	regs = (struct pt_regs *)((unsigned long) current + IA64_STK_OFFSET);
	regs--;
	ia64_psr(regs)->pp = 0;
}


static void
pmu_select_start(uint cpu)
{
	if (cpu == op_cpu_id())
		pmu_start(NULL);
	else
		smp_call_function(pmu_start, &cpu, 0, 1);
}


static void
pmu_select_stop(uint cpu)
{
	if (cpu == op_cpu_id())
		pmu_stop(NULL);
	else
		smp_call_function(pmu_stop, &cpu, 0, 1);
}


static void
pmu_start_all(void)
{
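	/* Prime each CPU's counter reset values from the sysctl settings,
	 * then install the overflow handler and start the PMU everywhere. */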
	int cpu, i;
 
	for (cpu=0; cpu < smp_num_cpus; cpu++) {
		struct _oprof_data * data = &oprof_data[cpu];

		for (i = 0 ; i < op_nr_counters ; ++i) {
			if (sysctl.ctr[i].enabled) {
				data->ctr_count[i] = sysctl.ctr[i].count;
			} else {
				data->ctr_count[i] = 0;
			}
		}
	}
 
	if (!install_handler()) {
		smp_call_function(pmu_start, NULL, 0, 1);
		pmu_start(NULL);
	}
	/* FIXME: need some way to report an install_handler() failure here */
}


static void
pmu_stop_all(void)
{
	smp_call_function(pmu_stop, NULL, 0, 1);
	pmu_stop(NULL);
	restore_handler();
}

 
static int
pmu_check_params(void)
{
	int i;
	int enabled = 0;

	for (i = 0; i < op_nr_counters ; i++) {
		if (!sysctl.ctr[i].enabled)
			continue;

		enabled = 1;

		if (!sysctl.ctr[i].user && !sysctl.ctr[i].kernel) {
			printk(KERN_ERR "oprofile: neither kernel nor user "
			       "set for counter %d\n", i);
			return -EINVAL;
		}

		if (check_range(sysctl.ctr[i].count, 1, OP_MAX_PERF_COUNT,
			"ctr count value %d not in range (%d %ld)\n"))
			return -EINVAL;
	}

	if (!enabled) {
		printk(KERN_ERR "oprofile: no counters have been enabled.\n");
		return -EINVAL;
	}

	return 0;
}


static struct op_msrs cpu_msrs[NR_CPUS];


static void free_msr_group(struct op_msr_group * group)
{
	if (group->addrs)
		kfree(group->addrs);
	if (group->saved)
		kfree(group->saved);
	group->addrs = NULL;
	group->saved = NULL;
}
 

static void pmu_save_registers(void * dummy)
{
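	/* Save this CPU's current pmc/pmd contents into cpu_msrs[] so that
	 * pmu_restore_registers() can put them back at deinit time; the
	 * addrs arrays are unused here and stay NULL. */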
	uint i;
	uint const cpu = op_cpu_id();
	struct op_msr_group * counters = &cpu_msrs[cpu].counters;
	struct op_msr_group * controls = &cpu_msrs[cpu].controls;

	counters->addrs = NULL; 
	counters->saved = NULL;
	controls->addrs = NULL;
	controls->saved = NULL;

	counters->saved = kmalloc(
		op_nr_counters * sizeof(struct op_saved_msr), GFP_KERNEL);
	if (!counters->saved)
		goto fault;
 
	controls->saved = kmalloc(
		op_nr_counters * sizeof(struct op_saved_msr), GFP_KERNEL);
	if (!controls->saved)
		goto fault;
 
	for (i = 0; i < op_nr_counters; ++i) {
		controls->saved[i].low = get_pmc(i);
		counters->saved[i].low = get_pmd(i);
	}
	return;

fault:
	free_msr_group(counters);
	free_msr_group(controls);
}
 

static void pmu_restore_registers(void * dummy)
{
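	/* Write back the pmc/pmd values saved by pmu_save_registers() and
	 * release the per-CPU save areas. */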
	uint i;
	uint const cpu = op_cpu_id();
	struct op_msr_group * counters = &cpu_msrs[cpu].counters;
	struct op_msr_group * controls = &cpu_msrs[cpu].controls;

	for (i = 0; i < op_nr_counters; ++i) {
		set_pmc(controls->saved[i].low, i);
		set_pmd(counters->saved[i].low, i);
	}

	free_msr_group(counters);
	free_msr_group(controls);
}



static int
pmu_init(void)
{
	int err = 0; 

	/* figure out the processor type and configure the number of bits
	   in the pmd and the number of counters */
	switch (get_cpu_type()) {
	case CPU_IA64_1:
		pmd_mask = IA64_1_PMD_MASK_VAL; break;
	case CPU_IA64_2:
	case CPU_IA64:
		pmd_mask = IA64_2_PMD_MASK_VAL; break;
	default:
		err = -EIO; break;
	}

	op_nr_counters = 4;

	if ((err = smp_call_function(pmu_save_registers, NULL, 0, 1)))
		goto out;

	pmu_save_registers(NULL);

out:
	return err;
}
 

static void
pmu_deinit(void)
{
	smp_call_function(pmu_restore_registers, NULL, 0, 1);
	pmu_restore_registers(NULL);
}
 

static char * names[] = { "0", "1", "2", "3", };


static int
pmu_add_sysctls(ctl_table * next)
{
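	/* Create one sysctl directory per counter ("0".."3"), each holding
	 * the enabled/event/count/unit_mask/kernel/user entries. */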
	ctl_table * start = next; 
	ctl_table * tab; 
	int i, j;
 
	for (i=0; i < op_nr_counters; i++) {
		next->ctl_name = 1;
		next->procname = names[i];
		next->mode = 0700;

		if (!(tab = kmalloc(sizeof(ctl_table)*7, GFP_KERNEL)))
			goto cleanup;
 
		next->child = tab;

		memset(tab, 0, sizeof(ctl_table)*7);
		tab[0] = ((ctl_table) { 1, "enabled", &sysctl_parms.ctr[i].enabled, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
		tab[1] = ((ctl_table) { 1, "event", &sysctl_parms.ctr[i].event, sizeof(int), 0600, NULL, lproc_dointvec, NULL,  });
		tab[2] = ((ctl_table) { 1, "count", &sysctl_parms.ctr[i].count, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
		tab[3] = ((ctl_table) { 1, "unit_mask", &sysctl_parms.ctr[i].unit_mask, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
		tab[4] = ((ctl_table) { 1, "kernel", &sysctl_parms.ctr[i].kernel, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
		tab[5] = ((ctl_table) { 1, "user", &sysctl_parms.ctr[i].user, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
		next++;
	}

	return 0;

cleanup:
	next = start;
	for (j = 0; j < i; j++) {
		kfree(next->child);
		next++;
	}
	return -EFAULT;
}


static void pmu_remove_sysctls(ctl_table * next)
{
	int ii;

	for (ii=0; ii < op_nr_counters; ii++) {
		kfree(next->child);
		next++;
	}
}
 

struct op_int_operations op_nmi_ops = {
	init: pmu_init,
	deinit: pmu_deinit,
	add_sysctls: pmu_add_sysctls,
	remove_sysctls: pmu_remove_sysctls,
	check_params: pmu_check_params,
	setup: pmu_setup_all,
	start: pmu_start_all,
	stop: pmu_stop_all,
	start_cpu: pmu_select_start,
	stop_cpu: pmu_select_stop, 
};


struct op_int_operations const * op_int_interface(void)
{
	return &op_nmi_ops;
}

/* Need this dummy so module/oprofile.c links */
struct op_int_operations op_rtc_ops = {
	init: NULL,
	deinit: NULL,
	add_sysctls: NULL,
	remove_sysctls: NULL,
	check_params: NULL,
	setup: NULL,
	start: NULL,
	stop: NULL,
	start_cpu: NULL,
	stop_cpu: NULL,
};