Linux内核 eBPF基础
perf(2):性能管理单元PMU的注册


荣涛
2021年5月18日

1. perf类型

include\uapi\linux\perf_event.h中有:

/*
 * attr.type
 */
enum perf_type_id { /* perf 类型 */
	PERF_TYPE_HARDWARE			= 0,    /* 硬件 */
	PERF_TYPE_SOFTWARE			= 1,    /* 软件 */
	PERF_TYPE_TRACEPOINT		= 2,    /* 跟踪点 */
	PERF_TYPE_HW_CACHE			= 3,    /* 硬件cache */
	PERF_TYPE_RAW				= 4,    /* RAW */
	PERF_TYPE_BREAKPOINT		= 5,    /* 断点 */

	PERF_TYPE_MAX,				/* non-ABI */
};

他们是传入性能管理单元PMU注册函数perf_pmu_register的字段type。列出注册的PMU:

[rongtao@localhost src]$ grep -r "perf_pmu_register" | grep "\""
arch/x86/events/intel/bts.c:	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
arch/x86/events/intel/pt.c:	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
arch/x86/events/amd/power.c:	ret = perf_pmu_register(&pmu_class, "power", -1);
arch/x86/events/core.c:	err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
arch/x86/events/msr.c:	perf_pmu_register(&pmu_msr, "msr", -1);
arch/x86/events/rapl.c:	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
kernel/events/hw_breakpoint.c:	perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
kernel/events/core.c:	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
kernel/events/core.c:	perf_pmu_register(&perf_kprobe, "kprobe", -1);
kernel/events/core.c:	perf_pmu_register(&perf_uprobe, "uprobe", -1);
kernel/events/core.c:	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);

kernel/events/core.c:	perf_pmu_register(&perf_cpu_clock, NULL, -1);
kernel/events/core.c:	perf_pmu_register(&perf_task_clock, NULL, -1);

2. perf_pmu_register

int perf_pmu_register(struct pmu *pmu, const char *name, int type)

这里需要注意,函数perf_pmu_register是非常重要的注册函数,注册的pmu将加入全局链表pmus中:

static LIST_HEAD(pmus);

函数perf_pmu_register首先申请per-cpu变量:

pmu->pmu_disable_count = alloc_percpu(int);

接着,如果类型不是PERF_TYPE_SOFTWARE,将分配一个ID(前提是name没有设定,如perf_cpu_clock

if (type != PERF_TYPE_SOFTWARE) {
	if (type >= 0)
		max = type;

    /* 分配一个ID */
	ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
	if (ret < 0)
		goto free_pdc;

	WARN_ON(type >= 0 && ret != type);

	type = ret;
}

然后,申请一个设备:

	if (pmu_bus_running/* perf_event_sysfs_init() 中被设置 为 1 */) {
		ret = pmu_dev_alloc(pmu);   /* 分配一个设备 device- /sys/devices/ */
		if (ret)
			goto free_idr;
	}

接下来这段代码表明,每个hw只能注册一次:

	if (pmu->task_ctx_nr == perf_hw_context) {
		static int hw_context_taken = 0;

		/*
		 * Other than systems with heterogeneous CPUs, it never makes
		 * sense for two PMUs to share perf_hw_context. PMUs which are
		 * uncore must use perf_invalid_context.
		 */
		if (WARN_ON_ONCE(hw_context_taken &&
		    !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
			pmu->task_ctx_nr = perf_invalid_context;

		hw_context_taken = 1;
	}

否则,其将被设置为perf_invalid_context。然后为每个CPU分配上下文:

    pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);

紧接着,进行初始化:

	for_each_possible_cpu(cpu) {    /* 遍历 CPU */
		struct perf_cpu_context *cpuctx;

		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);    /* 1.获取 CPU 的ctx */
		__perf_event_init_context(&cpuctx->ctx);            /* 2.初始化这个ctx */
		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);/*3.初始化lockdep  */
		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
		cpuctx->ctx.pmu = pmu;                              /* 4.指向这个PMU */
		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);/* 5.是否在线标记 */

		__perf_mux_hrtimer_init(cpuctx, cpu);               /* 6.高精度定时器,function=perf_mux_hrtimer_handler */

		cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);/*  */
		cpuctx->heap = cpuctx->heap_default;    /* 默认使用2个 */
	}

其中__perf_event_init_context初始化struct perf_event_context结构:

/*
 * Initialize the perf_event context in a task_struct:
 */
static void __perf_event_init_context(struct perf_event_context *ctx)   /* 初始化CPU ctx */
{
	raw_spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->active_ctx_list);
	perf_event_groups_init(&ctx->pinned_groups);
	perf_event_groups_init(&ctx->flexible_groups);
	INIT_LIST_HEAD(&ctx->event_list);
	INIT_LIST_HEAD(&ctx->pinned_active);
	INIT_LIST_HEAD(&ctx->flexible_active);
	refcount_set(&ctx->refcount, 1);
}

__perf_mux_hrtimer_init初始化一个高精度定时器,

static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)   /* 高精度定时器 */
{
	struct hrtimer *timer = &cpuctx->hrtimer;
	struct pmu *pmu = cpuctx->ctx.pmu;
	u64 interval;

	/* no multiplexing needed for SW PMU */
	if (pmu->task_ctx_nr == perf_sw_context)
		return;

	/*
	 * check default is sane, if not set then force to
	 * default interval (1/tick)
	 */
	interval = pmu->hrtimer_interval_ms;
	if (interval < 1)
		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; /* 小于1ms,就让他是 1ms */

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

	raw_spin_lock_init(&cpuctx->hrtimer_lock);
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
	timer->function = perf_mux_hrtimer_handler; /* 处理函数 */
}

需要注意一下几点:

  • 如果是软件上下文perf_sw_context,不创建定时器;
  • 如果ioctl设置的到期时间小于1ms,将其设置为1ms
  • 会调函数为perf_mux_hrtimer_handler

在获取到CPU上下文后,给没有初始化的PMU函数指针赋值:

    /*  */
	if (!pmu->start_txn) {
		if (pmu->pmu_enable) {
			/*
			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use that to try and batch
			 * hardware accesses.
			 */
			pmu->start_txn  = perf_pmu_start_txn;
			pmu->commit_txn = perf_pmu_commit_txn;
			pmu->cancel_txn = perf_pmu_cancel_txn;
		} else {
			pmu->start_txn  = perf_pmu_nop_txn;
			pmu->commit_txn = perf_pmu_nop_int;
			pmu->cancel_txn = perf_pmu_nop_void;
		}
	}

    /* 使能 */
	if (!pmu->pmu_enable) {
		pmu->pmu_enable  = perf_pmu_nop_void;
		pmu->pmu_disable = perf_pmu_nop_void;
	}

    /* 检测周期 ioctl(PERF_EVENT_IOC_PERIOD) */
	if (!pmu->check_period)
		pmu->check_period = perf_event_nop_int;

    /*  */
	if (!pmu->event_idx)
		pmu->event_idx = perf_event_idx_default;

下面是将这个PMU添加到pmus链表中:

	/*
	 * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
	 * since these cannot be in the IDR. This way the linear search
	 * is fast, provided a valid software event is provided.
	 */
	if (type == PERF_TYPE_SOFTWARE || !name)
		list_add_rcu(&pmu->entry, &pmus);   /* 软件 或者 name=NULL */
	else
		list_add_tail_rcu(&pmu->entry, &pmus);/*  */

需要注意的是,软件类型的PMU将放到链表开头,以提高线性查询速度。

3. 例: software

//kernel/events/core.c
static struct pmu/* 性能监控单元 */ perf_swevent = {
	.task_ctx_nr	= perf_sw_context,

	.capabilities	= PERF_PMU_CAP_NO_NMI,

	.event_init	= perf_swevent_init,
	.add		= perf_swevent_add,
	.del		= perf_swevent_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
};

perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);

3.1. perf_swevent_init

4. 例: perf_cpu_clock

//kernel/events/core.c
static struct pmu perf_cpu_clock = {
	.task_ctx_nr	= perf_sw_context,

	.capabilities	= PERF_PMU_CAP_NO_NMI,

	.event_init	= cpu_clock_event_init,
	.add		= cpu_clock_event_add,
	.del		= cpu_clock_event_del,
	.start		= cpu_clock_event_start,
	.stop		= cpu_clock_event_stop,
	.read		= cpu_clock_event_read,
};

perf_pmu_register(&perf_cpu_clock, NULL, -1);

5. 例: perf_task_clock

//kernel/events/core.c
static struct pmu perf_task_clock = {
	.task_ctx_nr	= perf_sw_context,

	.capabilities	= PERF_PMU_CAP_NO_NMI,

	.event_init	= task_clock_event_init,
	.add		= task_clock_event_add,
	.del		= task_clock_event_del,
	.start		= task_clock_event_start,
	.stop		= task_clock_event_stop,
	.read		= task_clock_event_read,
};

perf_pmu_register(&perf_task_clock, NULL, -1);

6. 例: kprobe

//kernel/events/core.c
static struct pmu perf_kprobe = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= perf_kprobe_event_init,
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
	.attr_groups	= kprobe_attr_groups,
};

perf_pmu_register(&perf_kprobe, "kprobe", -1);

7. 例: tracepoint

//kernel/events/core.c
static struct pmu perf_tracepoint = {
	.task_ctx_nr	= perf_sw_context,

	.event_init	= perf_tp_event_init,
	.add		= perf_trace_add,
	.del		= perf_trace_del,
	.start		= perf_swevent_start,
	.stop		= perf_swevent_stop,
	.read		= perf_swevent_read,
};

perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);

8. pmu->event_init

perf_event_alloc
    perf_init_event
        perf_try_init_event
            pmu->event_init(event);

而调用了perf_event_alloc的有:

  • perf_event_open
  • perf_event_create_kernel_counter
  • fork|clone
kernel_clone
    copy_process
        perf_event_init_task
            perf_event_init_context
                inherit_task_group
                    inherit_group
                        inherit_event
                            perf_event_alloc

9. pmu->add

perf_event_enable
    _perf_event_enable
        __perf_event_enable
            ctx_sched_in
                ctx_flexible_sched_in|ctx_pinned_sched_in
                    merge_sched_in
                        group_sched_in
                            event_sched_in
                                event->pmu->add(event, PERF_EF_START)

10. pmu->del

perf_event_disable
    _perf_event_disable
        __perf_event_disable
            group_sched_out
                event_sched_out
                    event->pmu->del(event, 0);

11. pmu->start

12. pmu->stop

13. pmu->read

14. 相关链接

Logo

更多推荐