全局时钟事件设备(Global Clock Event Device):HPET/PIT

  • 主要负责提供周期时钟,更新 jiffies

  • 全局时钟的角色由一个明选择的局部时钟承担,每个 cpu 都有 local apic,而 global clock 有一个特定的 cpu 承担,全局的时钟事件设备虽然附属于某一个特定的 CPU 上,但是完成的是系统相关的工作,例如完成系统的 tick 更新,说白了就是某个 cpu 一个人接俩活~

  • 结构 struct clock_event_device

局部时钟事件设备(Local Clock Event Device):lapic

  • 每个 CPU 都有一个局部时钟,用作进程统计,最主要的实现了高分辨率定时器(只能工作在提供了lapic 的系统上)

  • 主要完成统计运行在当前 CPU 上的进程的统计,以及设置 Local CPU 下一次中断

  • 结构 struct clock_event_device

时钟设备(tick device)

`</p>

struct tick_device {
        struct clock_event_device *evtdev;
        enum tick_device_mode mode;
};    

enum tick_device_mode {
        TICKDEV_MODE_PERIODIC,
        TICKDEV_MODE_ONESHOT,
};`

tick device 只是 clock_event_device 的一包装器,增加了而外的字段用于指定设备的运行模式(周期或者单触发)。

全局 tick device

`</p>

static struct tick_device tick_broadcast_device;`

tick_broadcast_device 很重要,后面会说到~

 

查看当前系统的 Global Clock Event Device 以及 Local Clock Event Device

`</p>

cat /proc/timer_list`

查看 tick_device

`</p>

cat /proc/timer_list | grep "Clock Event Device"
Clock Event Device: hpet
Clock Event Device: lapic
Clock Event Device: lapic
Clock Event Device: lapic
Clock Event Device: lapic`

查看 event_handler

`</p>

cat /proc/timer_list | grep "event_handler"
 event_handler:  tick_handle_oneshot_broadcast
 event_handler:  hrtimer_interrupt
 event_handler:  hrtimer_interrupt
 event_handler:  hrtimer_interrupt
 event_handler:  hrtimer_interrupt`

从以上信息可以得出 Global Clock Event Device 是 hpet,event_hanler 为 tick_handle_oneshot_broadcast,Local Clock Event Device 是 lapic,event_hanler 为 hrtimer_interrupt,当前系统使用的是高分辨率的 Timer.

event_handler 是什么呢就是中断到来所要执行的函数。(在中断处理程序中被调用)

大概是这个这样的

Global

`</p>

static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
         ......

         global_clock_event->event_handler(global_clock_event);
         // tick_handle_oneshot_broadcast

         ......
}`

Local

`</p>

void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
{
         ......

         local_apic_timer_interrupt();

         ......
}

static void local_apic_timer_interrupt(void)
{
        int cpu = smp_processor_id();
        struct clock_event_device *evt = &per_cpu(lapic_events, cpu);

        ......

        evt->event_handler(evt);
        // hrtimer_interrupt
}`

那 GLobal 与 Local 的 event_handler 是经过怎样的过程而最终得到的呢,下面就以此主线来分析~有关一些基本的概念大家可以参考 《Professional Linux Kernel Architecture》Chapter 15

我觉得要彻底弄懂一个东西就得看源代码,不看是怎么实现的就是看再多书也没用,这是亲身感受~ ok,Let us Go.

 

全局时钟事件设备的注册

`</p>

start_kernel()
  -> if (late_time_init)
           late_time_init()
           x86_late_time_init()
             -> hpet_time_init()
                  -> hpet_enable()
                       -> hpet_legacy_clockevent_register()`

 

static void hpet_legacy_clockevent_register(void)
{
        /* Start HPET legacy interrupts */
        hpet_enable_legacy_int();

        hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
                                      hpet_period, hpet_clockevent.shift);
        /* Calculate the min / max delta */
        hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
                                                           &hpet_clockevent);
        /* 5 usec minimum reprogramming delta. */
        hpet_clockevent.min_delta_ns = 5000;

        /*
         * Start hpet with the boot cpu mask and make it
         * global after the IO_APIC has been initialized.
         */
        hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
        // 当前 cpu 肯定是 cpu0
        clockevents_register_device(&hpet_clockevent);
        // 开始注册
        global_clock_event = &hpet_clockevent;
        // Global clock event
        printk(KERN_DEBUG "hpet clockevent registered\n");
}

`</p>

static struct clock_event_device hpet_clockevent = {
        .name           = "hpet",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .set_mode       = hpet_legacy_set_mode,
        .set_next_event = hpet_legacy_next_event,
        .shift          = 32,
        .irq            = 0,
        .rating         = 50,
};`

 

故事从这就开始了 ……

void clockevents_register_device(struct clock_event_device *dev)
{
        // dev=&hpet_clockevent

        unsigned long flags;

        BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
        BUG_ON(!dev->cpumask);
        // 此时设备必须绑定到某个 CPU 上,此时是 cpu0

        spin_lock_irqsave(&clockevents_lock, flags);

        list_add(&dev->list, &clockevent_devices);
        // 加入到 clockevent_devices 链表中,跟时钟源的注册很像,只要注意以下插入链表的方式就好

        clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);

        clockevents_notify_released();
        // 当 Global Clock Event Device 注册完 hpet 和 cpu0 注册 lapic 之后会被调用.

        spin_unlock_irqrestore(&clockevents_lock, flags);
}

static void clockevents_do_notify(unsigned long reason, void *dev)
{
        // dev 为时钟事件设备 (hpet) 
        raw_notifier_call_chain(&clockevents_chain, reason, dev);
}

// clockevents_chain 已经在 tick_init 函数中初始化
// clockevents_chain->head = &tick_notifier

struct raw_notifier_head {
                struct notifier_block *head;
};

int raw_notifier_call_chain(struct raw_notifier_head *nh,
                                unsigned long val, void *v)
{
                return __raw_notifier_call_chain(nh, val, v, -1, NULL);
}

int __raw_notifier_call_chain(struct raw_notifier_head *nh,
                              unsigned long val, void *v,
                              int nr_to_call, int *nr_calls)
{
        return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
}

static int __kprobes notifier_call_chain(struct notifier_block **nl,
                                        unsigned long val, void *v,
                                        int nr_to_call, int *nr_calls)
{
        int ret = NOTIFY_DONE;
        struct notifier_block *nb, *next_nb;

        nb = rcu_dereference(*nl);
        // nb 指向 notifier_block

        while (nb && nr_to_call) {
                // nr_to_call = -1

                next_nb = rcu_dereference(nb->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
                if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
                        WARN(1, "Invalid notifier called!");
                        nb = next_nb;
                        continue;
                }
#endif
                ret = nb->notifier_call(nb, val, v);
                // 调用 tick_notify 函数 val = CLOCK_EVT_NOTIFY_ADD 
                // 返回 ret = NOTIFY_STOP

                if (nr_calls)
                        (*nr_calls)++;
                // nr_calls = NULL;

                if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
                        break;
                nb = next_nb;
                nr_to_call--;
        }
        return ret;
}

#define NOTIFY_STOP_MASK        0x8000          /* Don't call further */
#define NOTIFY_DONE             0x0000          /* Don't care */
#define NOTIFY_OK               0x0001          /* Suits me */
#define NOTIFY_STOP             (NOTIFY_OK|NOTIFY_STOP_MASK) 

static int tick_notify(struct notifier_block *nb, unsigned long reason,
                               void *dev)
{
        switch (reason) {

        case CLOCK_EVT_NOTIFY_ADD:
                return tick_check_new_device(dev);
                // 这里

        case CLOCK_EVT_NOTIFY_BROADCAST_ON:
        case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
        case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
                tick_broadcast_on_off(reason, dev);
                break;

        case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
        case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
                tick_broadcast_oneshot_control(reason);
                break;

        case CLOCK_EVT_NOTIFY_CPU_DYING:
                tick_handover_do_timer(dev);
                break;

        case CLOCK_EVT_NOTIFY_CPU_DEAD:
                tick_shutdown_broadcast_oneshot(dev);
                tick_shutdown_broadcast(dev);
                tick_shutdown(dev);
                break;

        case CLOCK_EVT_NOTIFY_SUSPEND:
                tick_suspend();
                tick_suspend_broadcast();
                break;

        case CLOCK_EVT_NOTIFY_RESUME:
                tick_resume();
                break;

        default:
                break;
        }

        return NOTIFY_OK;
}

 

接下来的两个函数很重要,当时看的时候丈二和尚摸不着头脑,但是经过各种 debug kernel,各种 brew,终于看出了点门道~

static int tick_check_new_device(struct clock_event_device *newdev)
{
        // newdev = &hpet_clockevent;

        struct clock_event_device *curdev;
        struct tick_device *td;
        int cpu, ret = NOTIFY_OK;
        unsigned long flags;

        spin_lock_irqsave(&tick_device_lock, flags);

        cpu = smp_processor_id();
        if (!cpumask_test_cpu(cpu, newdev->cpumask))
                goto out_bc;
        // cpumask:cpumask to indicate for which CPUs this device works

        td = &per_cpu(tick_cpu_device, cpu);
        // td 时钟设备,tick_cpu_device 是一个每CPU链表,包含了系统中每个CPU对应的struct tick_device 实例.关于每 CPU 变量这里就不说了,因为也不是一两句话能说明白的。

        curdev = td->evtdev;
        // 此时由于刚注册时钟设备上没有时钟事件设备,所以 curdev 为 NULL,而之后发生的可完全不一样那时 curdev 不为空,到时候再说

        /* cpu local device ? */
        if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {

                /*
                 * If the cpu affinity of the device interrupt can not
                 * be set, ignore it.
                 */
                if (!irq_can_set_affinity(newdev->irq))
                        goto out_bc;

                /*
                 * If we have a cpu local device already, do not replace it
                 * by a non cpu local device
                 */
                if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
                        goto out_bc;
        } 
        // 这几行代码没太看懂貌似根本就没有被执行?

        /*
         * If we have an active device, then check the rating and the oneshot
         * feature.
         */
        if (curdev) {
                // 2.curdev = hpet
                // 2.newdev = lapic

                // 3.curdev = lapic
                // 3.newdev = hpet

                /*
                 * Prefer one shot capable devices !
                 */
                if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
                    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
                        goto out_bc;
                /*
                 * Check the rating
                 */
                if (curdev->rating >= newdev->rating)
                        goto out_bc;

                // lapic rating = 100,hpet rating = 50 
        }
        // curdev = NULL ,ignore this "if",这块是个关键点

        /*
         * Replace the eventually existing device by the new
         * device. If the current device is the broadcast device, do
         * not give it back to the clockevents layer !
         */
        if (tick_is_broadcast_device(curdev)) {
                clockevents_shutdown(curdev);
                curdev = NULL;
        }

        clockevents_exchange_device(curdev, newdev);

        tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
        // 建立 clock event device

        if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
                tick_oneshot_notify();
                // here

        spin_unlock_irqrestore(&tick_device_lock, flags);
        return NOTIFY_STOP;

out_bc:
        /*
         * Can the new device be used as a broadcast device ?
         */
        if (tick_check_broadcast_device(newdev))
                ret = NOTIFY_STOP;

        spin_unlock_irqrestore(&tick_device_lock, flags);

        return ret;
}

 

int tick_is_broadcast_device(struct clock_event_device *dev)
{
        return (dev && tick_broadcast_device.evtdev == dev);
}

void clockevents_exchange_device(struct clock_event_device *old,
                                 struct clock_event_device *new)
{
        // <condition 1>
        //old = NULL
        //new = &hpet_clockevent 

        // <condition 2>
        // old = hpet 
        // new = lapic

        unsigned long flags;

        local_irq_save(flags);
        /*
         * Caller releases a clock event device. We queue it into the
         * released list and do a notify add later.
         */
        if (old) {
                clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
                list_del(&old->list);
                list_add(&old->list, &clockevents_released);
                // 加到 clockevents_released 链表中
        }

        if (new) {
                BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
                clockevents_shutdown(new);
        }
        local_irq_restore(flags);
        // 此时我们考虑 condition 1
}

 

static void tick_setup_device(struct tick_device *td, 
                              struct clock_event_device *newdev, int cpu, 
                              const struct cpumask *cpumask)
{
        ktime_t next_event;
        void (*handler)(struct clock_vent_device *) = NULL;

        /*   
         * First device setup ?
         */
        if (!td->evtdev) {
        // 此时钟设备没有相关的时钟事件设备
     
                /*   
                 * If no cpu took the do_timer update, assign it to
                 * this cpu:
                 */
                if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
                        // 如果没有选定时钟设备来承担全局时钟设备的角色,那么将选择当前设备来承担此职责

                        tick_do_timer_cpu = cpu; 
                        // 设置为当前设备所属处理器编号
                        tick_next_period = ktime_get();

                        tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 
                        // 时钟周期,纳秒
                        //HZ = 1000 
                }    

                /*   
                 * Startup in periodic mode first.
                 */
                td->mode = TICKDEV_MODE_PERIODIC;
                // 设备运行模式 --> 周期模式

        } else {
                // 关于这个 else ,我们某天会来到这里,现在 ignore it.
                handler = td->evtdev->event_handler;
                next_event = td->evtdev->next_event;
                td->evtdev->event_handler = clockevents_handle_noop;
        }    

        td->evtdev = newdev;
        //为时钟设备指定事件设备

        /*
         * When the device is not per cpu, pin the interrupt to the
         * current cpu:
         */
        if (!cpumask_equal(newdev->cpumask, cpumask))
                irq_set_affinity(newdev->irq, cpumask);

        /*
         * When global broadcasting is active, check if the current
         * device is registered as a placeholder for broadcast mode.
         * This allows us to handle this x86 misfeature in a generic
         * way.
         */
        // check whether enable the broadcast mode,如果系统处于省电模式,而局部时钟停止工作,则会使用广播机制
        if (tick_device_uses_broadcast(newdev, cpu))
                return;

        if (td->mode == TICKDEV_MODE_PERIODIC)
                tick_setup_periodic(newdev, 0);
                // 周期模式 invoke this ......
        else
                tick_setup_oneshot(newdev, handler, next_event);
                // 单触发模式
}

 

void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
{
        tick_set_periodic_handler(dev, broadcast);
        // broadcast = 0

        /* Broadcast setup ? */
        if (!tick_device_is_functional(dev))
                return;

        if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
            !tick_broadcast_oneshot_active()) {
                clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
                // here
                // 设置成周期模式
        } else {
                unsigned long seq;
                ktime_t next;

                do {
                        seq = read_seqbegin(&xtime_lock);
                        next = tick_next_period;
                } while (read_seqretry(&xtime_lock, seq));

                clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);

                for (;;) {
                        if (!clockevents_program_event(dev, next, ktime_get()))
                                return;
                        next = ktime_add(next, tick_period);
                }
        }
}

void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
        if (!broadcast)
                dev->event_handler = tick_handle_periodic;
                // here
        else
                dev->event_handler = tick_handle_periodic_broadcast;
}

static inline int tick_device_is_functional(struct clock_event_device *dev)
{
        return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
}

此时 Global event_handler 的注册就接近尾声了,event_handler = tick_handle_periodic,不对啊应该是 “tick_handle_oneshot_broadcast”啊,是啊,莫急,故事远没有结束这才是一个开始 ……