/**
* @file op_pmu.c
* Setup and handling of the IA64 Performance Monitoring Unit (PMU)
*
* @remark Copyright 2002 OProfile authors
* @remark Read the file COPYING
*
* @author Bob Montgomery
* @author Will Cohen
* @author John Levon
* @author Philippe Elie
*/
#include "oprofile.h"
#include "op_util.h"
#include <asm/perfmon.h>
#include "op_ia64_model.h"
/* number of counters physically present */
static uint op_nr_counters = 4;
/* Performance counters come in pairs, pmcN and pmdN: the pmc register
* selects and configures the event, the pmd register holds the count. */
#define perf_reg(c) ((c)+4)
#define IA64_1_PMD_MASK_VAL ((1UL << 32) - 1)
#define IA64_2_PMD_MASK_VAL ((1UL << 47) - 1)
/* The appropriate value is selected in pmu_init() */
unsigned long pmd_mask = IA64_2_PMD_MASK_VAL;
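/* Only part of each pmd is implemented: roughly 32 bits on Itanium
* and 47 bits on Itanium 2. The mask keeps every pmd write within the
* implemented width, so that arming values such as -(count) are
* truncated rather than spilling into unimplemented bits. */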
#define pmd_overflowed(r, c) ((r) & (1UL << perf_reg(c)))
#define set_pmd_neg(v, c) do { \
ia64_set_pmd(perf_reg(c), -(ulong)(v) & pmd_mask); \
ia64_srlz_d(); } while (0)
#define set_pmd(v, c) do { \
ia64_set_pmd(perf_reg(c), (v) & pmd_mask); \
ia64_srlz_d(); } while (0)
#define set_pmc(v, c) do { ia64_set_pmc(perf_reg(c), (v)); ia64_srlz_d(); } while (0)
#define get_pmd(c) ia64_get_pmd(perf_reg(c))
#define get_pmc(c) ia64_get_pmc(perf_reg(c))
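/* A counter is armed by writing the negated count: after
* set_pmd_neg(N, c), counter c overflows (and raises the PMU
* interrupt) once N more events occur. A sketch of arming counter 0
* for one sample per 100000 events:
*
*	set_pmd_neg(100000, 0);		// pmd4 = -100000UL & pmd_mask
*	set_pmc(pmc_val, 0);		// pmc4 = event selection bits
*/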
/* ---------------- IRQ handler ------------------ */
/* The args match those of pfm_overflow_handler in perfmon.c.
* There, the task_struct is filled in with the perfmon "owner" of
* the PMU. This might change; it is not clear it makes sense for
* system-wide profiling even in perfmon.
* pmc0 is a bit mask of overflowed counters (bits 4-7).
* Counting resumes when the caller clears pmc0.
*/
static inline void
op_do_pmu_interrupt(u64 pmc0, struct pt_regs * regs)
{
uint cpu = op_cpu_id();
int ctr;
for (ctr = 0 ; ctr < op_nr_counters ; ++ctr) {
if (pmd_overflowed(pmc0, ctr)) {
op_do_profile(cpu, regs->cr_iip, 1, ctr);
set_pmd_neg(oprof_data[cpu].ctr_count[ctr], ctr);
}
}
}
static void
op_raw_pmu_interrupt(int irq, void * arg, struct pt_regs * regs)
{
u64 pmc0;
pmc0 = ia64_get_pmc(0);
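/* pmc0 bit 0 is the freeze bit, bits 4-7 flag the overflowed
* counters; anything set besides the freeze bit means a counter
* really overflowed. Writing 0 back to pmc0 below unfreezes the
* PMU and re-enables the interrupt. */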
if ((pmc0 & ~0x1UL) != 0UL) {
op_do_pmu_interrupt(pmc0, regs);
ia64_set_pmc(0, 0);
ia64_srlz_d();
}
}
#define MY_OPROFILE_VECTOR (IA64_PERFMON_VECTOR - 2)
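/* Borrow a vector just below perfmon's, so the PMU can be pointed at
* oprofile without unhooking perfmon itself; this assumes nothing
* else has claimed IA64_PERFMON_VECTOR - 2. */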
static void
op_set_pmv(void * dummy)
{
ia64_set_pmv(MY_OPROFILE_VECTOR);
ia64_srlz_d();
}
static void
op_restore_pmv(void* dummy)
{
ia64_set_pmv(IA64_PERFMON_VECTOR);
ia64_srlz_d();
}
static int
install_handler(void)
{
int err = 0;
/* Do it the official way, via request_irq(); beware the usual ia64
* confusion between vector numbers and irq numbers. */
err = request_irq(MY_OPROFILE_VECTOR, op_raw_pmu_interrupt,
SA_INTERRUPT | SA_PERCPU_IRQ, "oprofile", NULL);
if (err) {
printk(KERN_ALERT "oprofile_IA64: request_irq fails, "
"returns %d\n", err);
return err;
}
if ((smp_call_function(op_set_pmv, NULL, 0, 1))) {
printk(KERN_ALERT "oprofile_IA64: unexpected failure "
"of smp_call_function(op_set_pmv)\n");
}
op_set_pmv(NULL);
return err;
}
static int
restore_handler(void)
{
int err = 0;
if ((smp_call_function(op_restore_pmv, NULL, 0, 1))) {
printk(KERN_ALERT "oprofile_IA64: unexpected failure "
"of smp_call_function(op_restore_pmv)\n");
}
op_restore_pmv(NULL);
free_irq(MY_OPROFILE_VECTOR, NULL);
return err;
}
/* ---------------- PMU setup ------------------ */
/* This interface is somewhat artificial: the proc interface might
* really want to accept register values directly, and other PMU
* features are not exposed at all. Of course, accepting raw values
* would require all sorts of validity checking. */
static void
pmc_fill_in(ulong * val, u8 kernel, u8 user, u8 event, u8 um)
{
/* enable interrupt generation */
*val |= (1 << 5);
/* setup as a privileged monitor */
*val |= (1 << 6);
/* McKinley requires pmc4 to have bit 23 set (enable PMU).
* It is supposedly ignored in other pmc registers.
* Try assuming it's ignored in Itanium, too, and just
* set it for everyone.
*/
*val |= (1 << 23);
/* enable/disable chosen OS and USR counting */
if (user)
*val |= (1 << 3);
else
*val &= ~(1 << 3);
if (kernel)
*val |= (1 << 0);
else
*val &= ~(1 << 0);
/* what are we counting ? */
*val &= ~(0xff << 8);
*val |= ((event & 0xff) << 8);
*val &= ~(0xf << 16);
*val |= ((um & 0xf) << 16);
}
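/* For illustration (the event number is made up):
* pmc_fill_in(&v, 1, 1, 0x12, 0) on a zeroed v yields
*	v == (1<<0) | (1<<3) | (1<<5) | (1<<6) | (0x12<<8) | (1<<23)
* i.e. count at privilege levels 0 and 3, interrupt on overflow,
* privileged monitor, event 0x12, unit mask 0. */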
static void
pmu_setup(void * dummy)
{
ulong pmc_val;
int ii;
/* setup each counter */
for (ii = 0 ; ii < op_nr_counters ; ++ii) {
if (sysctl.ctr[ii].enabled) {
pmc_val = 0;
set_pmd_neg(sysctl.ctr[ii].count, ii);
pmc_fill_in(&pmc_val, sysctl.ctr[ii].kernel,
sysctl.ctr[ii].user, sysctl.ctr[ii].event,
sysctl.ctr[ii].unit_mask);
set_pmc(pmc_val, ii);
}
}
}
void
disable_psr(void * dummy)
{
struct pt_regs * regs;
/* disable profiling for my saved state */
regs = (struct pt_regs *)((unsigned long) current + IA64_STK_OFFSET);
regs--;
ia64_psr(regs)->pp = 0;
/* psr.up should already be clear, but make sure */
ia64_psr(regs)->up = 0;
/* disable profiling for my current state */
__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
#if defined(CONFIG_PERFMON) && defined(CONFIG_SMP)
#if V_AT_LEAST(2, 4, 21)
local_cpu_data->pfm_syst_info |= PFM_CPUINFO_SYST_WIDE;
local_cpu_data->pfm_syst_info &= ~PFM_CPUINFO_DCR_PP;
/* FIXME: what to do with the third flag, PFM_CPUINFO_EXCL_IDLE (0x4)? */
#else
/* disable profiling for everyone else */
local_cpu_data->pfm_syst_wide = 1;
local_cpu_data->pfm_dcr_pp = 0;
#endif
#endif
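/* clear any frozen/overflow state left in pmc0 */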
ia64_set_pmc(0, 0);
ia64_srlz_d();
}
static int
pmu_setup_all(void)
{
/* This would be a great place to reserve all cpus with
* some sort of call to perfmonctl (something like the
* CREATE_CONTEXT command). The current interface to
* perfmonctl wants to be called from a different task id
* for each CPU to be set up (and doesn't allow calls from
* modules).
*/
/* disable profiling with the psr.pp bit */
if ((smp_call_function(disable_psr, NULL, 0, 1)))
return -EFAULT;
disable_psr(NULL);
/* now I've reserved the PMUs and they should be quiet */
if ((smp_call_function(pmu_setup, NULL, 0, 1)))
return -EFAULT;
pmu_setup(NULL);
return 0;
}
#ifndef CONFIG_SMP
/* from linux/arch/ia64/kernel/perfmon.c */
/*
* Originally written by Ganesh Venkitachalam, IBM Corp.
* Copyright (C) 1999 Ganesh Venkitachalam <venkitac@us.ibm.com>
*
* Modifications by Stephane Eranian, Hewlett-Packard Co.
* Modifications by David Mosberger-Tang, Hewlett-Packard Co.
*
* Copyright (C) 1999-2002 Hewlett Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger-Tang <davidm@hpl.hp.com>
*/
/*
* On UP kernels, we do not need to constantly set the psr.pp bit
* when a task is scheduled. The psr.pp bit can only be changed in
* the kernel because of a user request. Given we are on a UP,
* non-preemptive kernel we know that no other task is running, so we
* can simply update their psr.pp from their saved state. There is
* thus no impact on the context switch code compared to the SMP case.
*/
static void
op_tasklist_toggle_pp(unsigned int val)
{
struct task_struct * p;
struct pt_regs * regs;
read_lock(&tasklist_lock);
for_each_task(p) {
regs = (struct pt_regs *)((unsigned long) p + IA64_STK_OFFSET);
/*
* position on pt_regs saved on stack on 1st entry into the kernel
*/
regs--;
/*
* update psr.pp
*/
ia64_psr(regs)->pp = val;
}
read_unlock(&tasklist_lock);
}
#endif
static void
pmu_start(void * info)
{
struct pt_regs * regs;
if (info && (*((uint *)info) != op_cpu_id()))
return;
/* printk(KERN_ALERT "oprofile_IA64: pmu_start on cpu %d\n",
op_cpu_id()); */
/* The default control register pp value is copied into psr.pp
* on an interrupt. This allows interrupt service routines to
* be monitored.
*/
ia64_set_dcr(ia64_get_dcr() | IA64_DCR_PP);
#ifdef CONFIG_PERFMON
#ifdef CONFIG_SMP
#if V_AT_LEAST(2, 4, 21)
local_cpu_data->pfm_syst_info |= PFM_CPUINFO_SYST_WIDE;
local_cpu_data->pfm_syst_info |= PFM_CPUINFO_DCR_PP;
/* FIXME: what to do with the third flag, PFM_CPUINFO_EXCL_IDLE (0x4)? */
#else
local_cpu_data->pfm_syst_wide = 1;
local_cpu_data->pfm_dcr_pp = 1;
#endif
#else
op_tasklist_toggle_pp(1);
#endif
#endif
/* set it in my saved state */
regs = (struct pt_regs *)((unsigned long) current + IA64_STK_OFFSET);
regs--;
ia64_psr(regs)->pp = 1;
/* set it in my current state */
__asm__ __volatile__ ("ssm psr.pp;;"::: "memory");
ia64_srlz_d();
}
static void
pmu_stop(void * info)
{
struct pt_regs * regs;
if (info && (*((uint *)info) != op_cpu_id()))
return;
/* stop in my current state */
__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
/* disable the dcr pp */
ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);
#ifdef CONFIG_PERFMON
#ifdef CONFIG_SMP
#if V_AT_LEAST(2, 4, 21)
local_cpu_data->pfm_syst_info &= ~PFM_CPUINFO_SYST_WIDE;
local_cpu_data->pfm_syst_info &= ~PFM_CPUINFO_DCR_PP;
/* FIXME: what to do with the third flag, PFM_CPUINFO_EXCL_IDLE (0x4)? */
#else
local_cpu_data->pfm_syst_wide = 0;
local_cpu_data->pfm_dcr_pp = 0;
#endif
#else
op_tasklist_toggle_pp(0);
#endif
#endif
/* disable in my saved state */
regs = (struct pt_regs *)((unsigned long) current + IA64_STK_OFFSET);
regs--;
ia64_psr(regs)->pp = 0;
}
static void
pmu_select_start(uint cpu)
{
if (cpu == op_cpu_id())
pmu_start(NULL);
else
smp_call_function(pmu_start, &cpu, 0, 1);
}
static void
pmu_select_stop(uint cpu)
{
if (cpu == op_cpu_id())
pmu_stop(NULL);
else
smp_call_function(pmu_stop, &cpu, 0, 1);
}
static void
pmu_start_all(void)
{
int cpu, i;
for (cpu = 0; cpu < smp_num_cpus; cpu++) {
struct _oprof_data * data = &oprof_data[cpu];
for (i = 0 ; i < op_nr_counters ; ++i) {
if (sysctl.ctr[i].enabled) {
data->ctr_count[i] = sysctl.ctr[i].count;
} else {
data->ctr_count[i] = 0;
}
}
}
if (!install_handler()) {
smp_call_function(pmu_start, NULL, 0, 1);
pmu_start(NULL);
}
/* FIXME: need some way to fail here */
}
static void
pmu_stop_all(void)
{
smp_call_function(pmu_stop, NULL, 0, 1);
pmu_stop(NULL);
restore_handler();
}
static int
pmu_check_params(void)
{
int i;
int enabled = 0;
for (i = 0; i < op_nr_counters ; i++) {
if (!sysctl.ctr[i].enabled)
continue;
enabled = 1;
if (!sysctl.ctr[i].user && !sysctl.ctr[i].kernel) {
printk(KERN_ERR "oprofile: neither kernel nor user "
"set for counter %d\n", i);
return -EINVAL;
}
if (check_range(sysctl.ctr[i].count, 1, OP_MAX_PERF_COUNT,
"ctr count value %d not in range (%d %ld)\n"))
return -EINVAL;
}
if (!enabled) {
printk(KERN_ERR "oprofile: no counters have been enabled.\n");
return -EINVAL;
}
return 0;
}
static struct op_msrs cpu_msrs[NR_CPUS];
static void free_msr_group(struct op_msr_group * group)
{
if (group->addrs)
kfree(group->addrs);
if (group->saved)
kfree(group->saved);
group->addrs = NULL;
group->saved = NULL;
}
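/* These structures are shared with the x86 driver, hence the "msr"
* naming. Only the .low fields are used to stash the pmc/pmd values
* (assuming .low is wide enough for them on ia64 builds); the .addrs
* arrays stay NULL throughout. */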
static void pmu_save_registers(void * dummy)
{
uint i;
uint const cpu = op_cpu_id();
struct op_msr_group * counters = &cpu_msrs[cpu].counters;
struct op_msr_group * controls = &cpu_msrs[cpu].controls;
counters->addrs = NULL;
counters->saved = NULL;
controls->addrs = NULL;
controls->saved = NULL;
/* GFP_ATOMIC rather than GFP_KERNEL: this also runs from
* smp_call_function context, where sleeping is not allowed */
counters->saved = kmalloc(
op_nr_counters * sizeof(struct op_saved_msr), GFP_ATOMIC);
if (!counters->saved)
goto fault;
controls->saved = kmalloc(
op_nr_counters * sizeof(struct op_saved_msr), GFP_ATOMIC);
if (!controls->saved)
goto fault;
for (i = 0; i < op_nr_counters; ++i) {
controls->saved[i].low = get_pmc(i);
counters->saved[i].low = get_pmd(i);
}
return;
fault:
free_msr_group(counters);
free_msr_group(controls);
}
static void pmu_restore_registers(void * dummy)
{
uint i;
uint const cpu = op_cpu_id();
struct op_msr_group * counters = &cpu_msrs[cpu].counters;
struct op_msr_group * controls = &cpu_msrs[cpu].controls;
for (i = 0; i < op_nr_counters; ++i) {
set_pmc(controls->saved[i].low, i);
set_pmd(counters->saved[i].low, i);
}
free_msr_group(counters);
free_msr_group(controls);
}
static int
pmu_init(void)
{
int err = 0;
/* figure out the processor type and configure the number of bits
in the pmd and the number of counters */
switch (get_cpu_type()) {
case CPU_IA64_1:
pmd_mask = IA64_1_PMD_MASK_VAL; break;
case CPU_IA64_2:
case CPU_IA64:
pmd_mask = IA64_2_PMD_MASK_VAL; break;
default:
err = -EIO; break;
}
op_nr_counters = 4;
if ((err = smp_call_function(pmu_save_registers, NULL, 0, 1)))
goto out;
pmu_save_registers(NULL);
out:
return err;
}
static void
pmu_deinit(void)
{
smp_call_function(pmu_restore_registers, NULL, 0, 1);
pmu_restore_registers(NULL);
}
static char * names[] = { "0", "1", "2", "3", };
static int
pmu_add_sysctls(ctl_table * next)
{
ctl_table * start = next;
ctl_table * tab;
int i, j;
for (i = 0; i < op_nr_counters; i++) {
next->ctl_name = 1;
next->procname = names[i];
next->mode = 0700;
if (!(tab = kmalloc(sizeof(ctl_table)*7, GFP_KERNEL)))
goto cleanup;
next->child = tab;
memset(tab, 0, sizeof(ctl_table)*7);
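/* seven slots: six counter parameters plus a zeroed sentinel entry */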
tab[0] = ((ctl_table) { 1, "enabled", &sysctl_parms.ctr[i].enabled, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
tab[1] = ((ctl_table) { 1, "event", &sysctl_parms.ctr[i].event, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
tab[2] = ((ctl_table) { 1, "count", &sysctl_parms.ctr[i].count, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
tab[3] = ((ctl_table) { 1, "unit_mask", &sysctl_parms.ctr[i].unit_mask, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
tab[4] = ((ctl_table) { 1, "kernel", &sysctl_parms.ctr[i].kernel, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
tab[5] = ((ctl_table) { 1, "user", &sysctl_parms.ctr[i].user, sizeof(int), 0600, NULL, lproc_dointvec, NULL, });
next++;
}
return 0;
cleanup:
next = start;
for (j = 0; j < i; j++) {
kfree(next->child);
next++;
}
return -EFAULT;
}
static void pmu_remove_sysctls(ctl_table * next)
{
int ii;
for (ii = 0; ii < op_nr_counters; ii++) {
kfree(next->child);
next++;
}
}
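/* Keep the op_nmi_ops name expected by the generic code, even though
* on ia64 these are ordinary PMU interrupts rather than NMIs. */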
struct op_int_operations op_nmi_ops = {
init: pmu_init,
deinit: pmu_deinit,
add_sysctls: pmu_add_sysctls,
remove_sysctls: pmu_remove_sysctls,
check_params: pmu_check_params,
setup: pmu_setup_all,
start: pmu_start_all,
stop: pmu_stop_all,
start_cpu: pmu_select_start,
stop_cpu: pmu_select_stop,
};
struct op_int_operations const * op_int_interface(void)
{
return &op_nmi_ops;
}
/* Need this dummy so module/oprofile.c links */
struct op_int_operations op_rtc_ops = {
init: NULL,
deinit: NULL,
add_sysctls: NULL,
remove_sysctls: NULL,
check_params: NULL,
setup: NULL,
start: NULL,
stop: NULL,
start_cpu: NULL,
stop_cpu: NULL,
};