Registration of the Local Clock Event Device

2012-03-18 20:02:54 +0000

In the previous installment we finished registering the global clock event device (hpet); now let's look at the registration of the local clock event device, taking CPU0 as the example.

Late in kernel initialization, in kernel thread 1 (the other CPUs have not been brought up yet), a line like "CPU0: Intel(R) Xeon(TM) CPU 3.20GHz stepping 04" shows up in the log. Here is the story behind it ...

Registration of the local clock event device

kernel_init()
  -> smp_prepare_cpus(setup_max_cpus)
     native_smp_prepare_cpus(64)
       -> x86_init.timers.setup_percpu_clockev()
          setup_boot_APIC_clock()
            -> setup_APIC_timer()
                 -> clockevents_register_device(levt)

 

static void __cpuinit setup_APIC_timer(void)
{
        struct clock_event_device *levt = &__get_cpu_var(lapic_events);

        if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) {
                lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
                /* Make LAPIC timer preferrable over percpu HPET */
                lapic_clockevent.rating = 150;
        }

        memcpy(levt, &lapic_clockevent, sizeof(*levt));
        levt->cpumask = cpumask_of(smp_processor_id());

        clockevents_register_device(levt);
        // register CPU0's local clock event device
}

 

/*
 * The local apic timer can be used for any function which is CPU local.
 */
static struct clock_event_device lapic_clockevent = {
        .name           = "lapic",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
                        | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
        .shift          = 32,
        .set_mode       = lapic_timer_setup,
        .set_next_event = lapic_next_event,
        .broadcast      = lapic_timer_broadcast,
        .rating         = 100,
        .irq            = -1,
};

 

The story begins again here.

void clockevents_register_device(struct clock_event_device *dev)
{
        // dev=&lapic_clockevent
 
        unsigned long flags;
 
        BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
        BUG_ON(!dev->cpumask);
        // the device must be bound to some CPU at this point; we are still on cpu0
 
        spin_lock_irqsave(&clockevents_lock, flags);
 
        list_add(&dev->list, &clockevent_devices);
        // add it to the clockevent_devices list; much like clocksource registration, just note how the device is inserted into the list
 
        clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
 
        clockevents_notify_released();
        // does its real work after hpet (the global clock event device) has been registered and cpu0 registers its lapic
 
        spin_unlock_irqrestore(&clockevents_lock, flags);
}

        ......

static int tick_check_new_device(struct clock_event_device *newdev)
{
        ......

        cpu = smp_processor_id();
        // still CPU0
        if (!cpumask_test_cpu(cpu, newdev->cpumask))
                goto out_bc;
        // cpumask:            cpumask to indicate for which CPUs this device works

        td = &per_cpu(tick_cpu_device, cpu);
        // still CPU0's struct tick_device instance

        curdev = td->evtdev;
        // note that curdev is not NULL this time; it points to hpet_clockevent

        ......

        if (curdev) {
                // curdev = &hpet_clockevent
                // newdev = &lapic_clockevent

                /*
                 * Prefer one shot capable devices !
                 */
                if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
                    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
                        goto out_bc;
                /*
                 * Check the rating
                 */
                if (curdev->rating >= newdev->rating)
                        goto out_bc;

                // lapic rating = 100, hpet rating = 50

                // A note here: a CPU's tick_device can be backed by only one clock
                // event device. CPU0's tick_device already holds hpet, which acts as
                // the global event device, so we must now decide which device to keep,
                // based on whether the clock event device supports one-shot mode and
                // on its rating; here execution simply falls through (lapic wins).
        }

        /*
         * Replace the eventually existing device by the new
         * device. If the current device is the broadcast device, do
         * not give it back to the clockevents layer !
         */
        if (tick_is_broadcast_device(curdev)) {
                clockevents_shutdown(curdev);
                curdev = NULL;
        }

        clockevents_exchange_device(curdev, newdev);

        tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
        // sets lapic_clockevent's event_handler to tick_handle_periodic

        ......        

        return NOTIFY_STOP;

out_bc:
        /*
         * Can the new device be used as a broadcast device ?
         */
        if (tick_check_broadcast_device(newdev))
                ret = NOTIFY_STOP;

        spin_unlock_irqrestore(&tick_device_lock, flags);

        return ret;
}

 

static void tick_setup_device(struct tick_device *td,
                              struct clock_event_device *newdev, int cpu,
                              const struct cpumask *cpumask)
{
        ......
        
        // back to the else branch foreshadowed earlier
        else {
                handler = td->evtdev->event_handler;
                // handler should be tick_handle_periodic

                next_event = td->evtdev->next_event;
                td->evtdev->event_handler = clockevents_handle_noop;
                // set hpet_clockevent's event_handler to clockevents_handle_noop
        }

        ......
}

 

void clockevents_exchange_device(struct clock_event_device *old,
                                 struct clock_event_device *new)
{
        // old = hpet
        // new = lapic
 
        unsigned long flags;
 
        local_irq_save(flags);
        /*
         * Caller releases a clock event device. We queue it into the
         * released list and do a notify add later.
         */
        if (old) {
                clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
                // set it to UNUSED mode
                list_del(&old->list);
                // remove it from the clockevent_devices list
                list_add(&old->list, &clockevents_released);
                // add it to the clockevents_released list
        }
 
        if (new) {
                BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
                clockevents_shutdown(new);
        }
        local_irq_restore(flags);
}

 

Good. Now we are back to the clockevents_notify_released() call in clockevents_register_device().

static void clockevents_notify_released(void)
{
        struct clock_event_device *dev;

        while (!list_empty(&clockevents_released)) {
                // list_empty returns 1 for an empty list; we enter the loop when
                // clockevents_released is non-empty
                // right now the device on that list is hpet

                dev = list_entry(clockevents_released.next,
                                 struct clock_event_device, list);
                // dev points to hpet_clockevent

                list_del(&dev->list);
                list_add(&dev->list, &clockevent_devices);
                // add it back to the clockevent_devices list
                clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
                // the event_handler gets set up again
        }
        // when the global clock event device was registered, clockevents_released was
        // empty, so the loop body never ran; this time it is different.
}

 

And the damn story starts all over again ...

By now the clock event device on CPU0's tick_device is the local one (lapic_clockevent); what remains is to take care of the global one~

static int tick_check_new_device(struct clock_event_device *newdev)
{
        ......

        cpu = smp_processor_id();
        // still CPU0
        if (!cpumask_test_cpu(cpu, newdev->cpumask))
                goto out_bc;
        // cpumask:            cpumask to indicate for which CPUs this device works

        td = &per_cpu(tick_cpu_device, cpu);
        // still CPU0's struct tick_device instance

        curdev = td->evtdev;
        // note curdev is not NULL; this time it points to lapic_clockevent

        ......

        if (curdev) {
                // curdev = &lapic_clockevent
                // newdev = &hpet_clockevent

                /*
                 * Prefer one shot capable devices !
                 */
                if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
                    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
                        goto out_bc;
                /*
                 * Check the rating
                 */
                if (curdev->rating >= newdev->rating)
                        goto out_bc;
                        // hpet_clockevent's rating is lower than that of the already
                        // registered lapic_clockevent, so the only option left is to
                        // check whether it can serve as the broadcast device

                // lapic rating = 100, hpet rating = 50
                
        }

        ......

out_bc:
        /*
         * Can the new device be used as a broadcast device ?
         */
        if (tick_check_broadcast_device(newdev))
                ret = NOTIFY_STOP;

        spin_unlock_irqrestore(&tick_device_lock, flags);

        return ret;
}

 

int tick_check_broadcast_device(struct clock_event_device *dev)
{
        if ((tick_broadcast_device.evtdev &&
             tick_broadcast_device.evtdev->rating >= dev->rating) ||
             (dev->features & CLOCK_EVT_FEAT_C3STOP))
                return 0;

        clockevents_exchange_device(NULL, dev);

        tick_broadcast_device.evtdev = dev;
        // tick_broadcast_device finally appears; it is set to hpet

        if (!cpumask_empty(tick_get_broadcast_mask()))
                tick_broadcast_start_periodic(dev);
                // would set periodic mode; in the current context this function is not
                // executed, i.e. hpet_clockevent's event_handler stays "clockevents_handle_noop"
        return 1;
}
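
For reference, the helper guarded above is tiny in 2.6.32-era kernels; note that it passes broadcast = 1, so tick_set_periodic_handler() (shown in the following post) would install tick_handle_periodic_broadcast rather than tick_handle_periodic:

static void tick_broadcast_start_periodic(struct clock_event_device *bc)
{
        if (bc)
                tick_setup_periodic(bc, 1);
                // broadcast = 1
}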

 

Now let's take stock of where we stand: the global device and CPU0's LAPIC have both been registered. Assuming a 4-core system, we are in kernel thread 1 late in kernel initialization, and the next step is to bring up the remaining CPUs (1-3) and register their LAPIC clock event devices.

kernel_init()
  -> smp_init()

static void __init smp_init(void)
{       
        unsigned int cpu;
        
        /* FIXME: This should be done in userspace --RR */
        for_each_present_cpu(cpu) {
                if (num_online_cpus() >= setup_max_cpus)
                        break;
                if (!cpu_online(cpu))
                        cpu_up(cpu);
        }
        // bring up the other CPUs; this ends up calling start_secondary()
                
        /* Any cleanup work */
        printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 
        smp_cpus_done(setup_max_cpus);
                
        ......
}

 

Registration of the local clock event device

cpu_up()
  ...
  -> _cpu_up()
     smp_ops.cpu_up(cpu)
       -> native_cpu_up()
            -> do_boot_cpu()
                 -> wakeup_secondary_cpu_via_init()
                      -> startup_ipi_hook()
                         ......
                         maybe (I did not quite work out how start_secondary gets invoked here)
                         start_secondary()
                           -> setup_secondary_APIC_clock()
                                -> setup_APIC_timer()
                                     -> clockevents_register_device(levt)

 

The LAPIC registration on the remaining CPUs works the same way as on CPU0 and is not complicated; in the end every one of them has its event_handler initialized to "tick_handle_periodic".

But ... however ... as I keep saying, the story is far from over; this is only the beginning ...

Registration of the Global Clock Event Device

2012-03-13 23:09:39 +0000

Global clock event device: HPET/PIT

  • Mainly responsible for providing the periodic tick and updating jiffies

  • The global clock role is taken by one explicitly chosen local clock. Every CPU has a local apic, while the global clock is handled by one particular CPU; although the global clock event device is attached to a specific CPU, the work it does is system-wide, e.g. updating the system tick. In plain words, one cpu takes on two jobs~

  • Structure: struct clock_event_device

Local clock event device: lapic

  • Every CPU has a local clock, used for process accounting and, above all, for implementing high-resolution timers (which only work on systems that provide a lapic)

  • Mainly accounts for the processes running on the current CPU and programs the CPU's next local interrupt

  • Structure: struct clock_event_device

The tick device

struct tick_device {
        struct clock_event_device *evtdev;
        enum tick_device_mode mode;
};

enum tick_device_mode {
        TICKDEV_MODE_PERIODIC,
        TICKDEV_MODE_ONESHOT,
};

A tick device is just a wrapper around a clock_event_device, adding an extra field that specifies the device's operating mode (periodic or one-shot).
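
Each CPU owns one such wrapper. In 2.6.32-era kernels the per-CPU instance consulted throughout these posts is defined in kernel/time/tick-common.c; for reference:

/*
 * Tick devices: one per CPU, looked up via per_cpu(tick_cpu_device, cpu)
 */
DEFINE_PER_CPU(struct tick_device, tick_cpu_device);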

The global tick device

static struct tick_device tick_broadcast_device;

tick_broadcast_device is important; more on it later~

 

Viewing the current system's global clock event device and local clock event devices

cat /proc/timer_list

Viewing the tick devices

cat /proc/timer_list | grep "Clock Event Device"
Clock Event Device: hpet
Clock Event Device: lapic
Clock Event Device: lapic
Clock Event Device: lapic
Clock Event Device: lapic

Viewing the event_handlers

cat /proc/timer_list | grep "event_handler"
 event_handler:  tick_handle_oneshot_broadcast
 event_handler:  hrtimer_interrupt
 event_handler:  hrtimer_interrupt
 event_handler:  hrtimer_interrupt
 event_handler:  hrtimer_interrupt

From the output above: the global clock event device is hpet, whose event_handler is tick_handle_oneshot_broadcast; the local clock event devices are lapic, whose event_handler is hrtimer_interrupt; the system is currently using high-resolution timers.

So what is event_handler? It is the function to execute when the timer interrupt arrives (it gets called from the interrupt handler).

It goes roughly like this:

Global

static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
         ......

         global_clock_event->event_handler(global_clock_event);
         // tick_handle_oneshot_broadcast

         ......
}

Local

void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
{
         ......

         local_apic_timer_interrupt();

         ......
}

static void local_apic_timer_interrupt(void)
{
        int cpu = smp_processor_id();
        struct clock_event_device *evt = &per_cpu(lapic_events, cpu);

        ......

        evt->event_handler(evt);
        // hrtimer_interrupt
}

So through what process do the global and local event_handlers end up with those values? That is the main thread of the analysis below~ For the underlying concepts, see Professional Linux Kernel Architecture, Chapter 15.

I believe that to understand something thoroughly you have to read the source code; without seeing how it is implemented, reading any number of books is useless. That is first-hand experience~ OK, let's go.

 

Registration of the global clock event device

start_kernel()
  -> if (late_time_init)
           late_time_init()
           x86_late_time_init()
             -> hpet_time_init()
                  -> hpet_enable()
                       -> hpet_legacy_clockevent_register()

 

static void hpet_legacy_clockevent_register(void)
{
        /* Start HPET legacy interrupts */
        hpet_enable_legacy_int();

        hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
                                      hpet_period, hpet_clockevent.shift);
        /* Calculate the min / max delta */
        hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
                                                           &hpet_clockevent);
        /* 5 usec minimum reprogramming delta. */
        hpet_clockevent.min_delta_ns = 5000;

        /*
         * Start hpet with the boot cpu mask and make it
         * global after the IO_APIC has been initialized.
         */
        hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
        // the current cpu is certainly cpu0
        clockevents_register_device(&hpet_clockevent);
        // registration starts here
        global_clock_event = &hpet_clockevent;
        // Global clock event
        printk(KERN_DEBUG "hpet clockevent registered\n");
}

static struct clock_event_device hpet_clockevent = {
        .name           = "hpet",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .set_mode       = hpet_legacy_set_mode,
        .set_next_event = hpet_legacy_next_event,
        .shift          = 32,
        .irq            = 0,
        .rating         = 50,
};

 

This is where the story begins ...

void clockevents_register_device(struct clock_event_device *dev)
{
        // dev=&hpet_clockevent

        unsigned long flags;

        BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
        BUG_ON(!dev->cpumask);
        // the device must be bound to some CPU at this point; here it is cpu0

        spin_lock_irqsave(&clockevents_lock, flags);

        list_add(&dev->list, &clockevent_devices);
        // add it to the clockevent_devices list; much like clocksource registration, just note how the device is inserted into the list

        clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);

        clockevents_notify_released();
        // does its real work after hpet (the global clock event device) has been registered and cpu0 registers its lapic

        spin_unlock_irqrestore(&clockevents_lock, flags);
}

static void clockevents_do_notify(unsigned long reason, void *dev)
{
        // dev is the clock event device (hpet)
        raw_notifier_call_chain(&clockevents_chain, reason, dev);
}

// clockevents_chain was already initialized in tick_init()
// clockevents_chain->head = &tick_notifier

struct raw_notifier_head {
                struct notifier_block *head;
};

int raw_notifier_call_chain(struct raw_notifier_head *nh,
                                unsigned long val, void *v)
{
                return __raw_notifier_call_chain(nh, val, v, -1, NULL);
}

int __raw_notifier_call_chain(struct raw_notifier_head *nh,
                              unsigned long val, void *v,
                              int nr_to_call, int *nr_calls)
{
        return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
}

static int __kprobes notifier_call_chain(struct notifier_block **nl,
                                        unsigned long val, void *v,
                                        int nr_to_call, int *nr_calls)
{
        int ret = NOTIFY_DONE;
        struct notifier_block *nb, *next_nb;

        nb = rcu_dereference(*nl);
        // nb points to the notifier_block

        while (nb && nr_to_call) {
                // nr_to_call = -1

                next_nb = rcu_dereference(nb->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
                if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
                        WARN(1, "Invalid notifier called!");
                        nb = next_nb;
                        continue;
                }
#endif
                ret = nb->notifier_call(nb, val, v);
                // calls tick_notify with val = CLOCK_EVT_NOTIFY_ADD
                // returns ret = NOTIFY_STOP

                if (nr_calls)
                        (*nr_calls)++;
                // nr_calls = NULL;

                if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
                        break;
                nb = next_nb;
                nr_to_call--;
        }
        return ret;
}

#define NOTIFY_STOP_MASK        0x8000          /* Don't call further */
#define NOTIFY_DONE             0x0000          /* Don't care */
#define NOTIFY_OK               0x0001          /* Suits me */
#define NOTIFY_STOP             (NOTIFY_OK|NOTIFY_STOP_MASK) 

static int tick_notify(struct notifier_block *nb, unsigned long reason,
                               void *dev)
{
        switch (reason) {

        case CLOCK_EVT_NOTIFY_ADD:
                return tick_check_new_device(dev);
                // here

        case CLOCK_EVT_NOTIFY_BROADCAST_ON:
        case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
        case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
                tick_broadcast_on_off(reason, dev);
                break;

        case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
        case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
                tick_broadcast_oneshot_control(reason);
                break;

        case CLOCK_EVT_NOTIFY_CPU_DYING:
                tick_handover_do_timer(dev);
                break;

        case CLOCK_EVT_NOTIFY_CPU_DEAD:
                tick_shutdown_broadcast_oneshot(dev);
                tick_shutdown_broadcast(dev);
                tick_shutdown(dev);
                break;

        case CLOCK_EVT_NOTIFY_SUSPEND:
                tick_suspend();
                tick_suspend_broadcast();
                break;

        case CLOCK_EVT_NOTIFY_RESUME:
                tick_resume();
                break;

        default:
                break;
        }

        return NOTIFY_OK;
}

 

The next two functions are important. When I first read them I could not make head or tail of them, but after plenty of kernel debugging and plenty of brooding I finally saw some of the logic~

static int tick_check_new_device(struct clock_event_device *newdev)
{
        // newdev = &hpet_clockevent;

        struct clock_event_device *curdev;
        struct tick_device *td;
        int cpu, ret = NOTIFY_OK;
        unsigned long flags;

        spin_lock_irqsave(&tick_device_lock, flags);

        cpu = smp_processor_id();
        if (!cpumask_test_cpu(cpu, newdev->cpumask))
                goto out_bc;
        // cpumask:cpumask to indicate for which CPUs this device works

        td = &per_cpu(tick_cpu_device, cpu);
        // td is the tick device; tick_cpu_device is a per-CPU variable that holds one
        // struct tick_device instance for every CPU in the system. Per-CPU variables
        // cannot be explained in a sentence or two, so I'll skip them here.

        curdev = td->evtdev;
        // the tick device was just created and has no clock event device yet, so curdev
        // is NULL; what happens later is quite different (curdev will be non-NULL), more on that then

        /* cpu local device ? */
        if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {

                /*
                 * If the cpu affinity of the device interrupt can not
                 * be set, ignore it.
                 */
                if (!irq_can_set_affinity(newdev->irq))
                        goto out_bc;

                /*
                 * If we have a cpu local device already, do not replace it
                 * by a non cpu local device
                 */
                if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
                        goto out_bc;
        } 
        // this branch is not taken here: hpet_clockevent.cpumask was set above to
        // cpumask_of(cpu0), so the device looks cpu-local and the check is skipped

        /*
         * If we have an active device, then check the rating and the oneshot
         * feature.
         */
        if (curdev) {
                // 2.curdev = hpet
                // 2.newdev = lapic

                // 3.curdev = lapic
                // 3.newdev = hpet

                /*
                 * Prefer one shot capable devices !
                 */
                if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
                    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
                        goto out_bc;
                /*
                 * Check the rating
                 */
                if (curdev->rating >= newdev->rating)
                        goto out_bc;

                // lapic rating = 100, hpet rating = 50
        }
        // curdev == NULL, so this "if" is skipped; this is a key point

        /*
         * Replace the eventually existing device by the new
         * device. If the current device is the broadcast device, do
         * not give it back to the clockevents layer !
         */
        if (tick_is_broadcast_device(curdev)) {
                clockevents_shutdown(curdev);
                curdev = NULL;
        }

        clockevents_exchange_device(curdev, newdev);

        tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
        // set up the tick device with the new clock event device

        if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
                tick_oneshot_notify();
                // here

        spin_unlock_irqrestore(&tick_device_lock, flags);
        return NOTIFY_STOP;

out_bc:
        /*
         * Can the new device be used as a broadcast device ?
         */
        if (tick_check_broadcast_device(newdev))
                ret = NOTIFY_STOP;

        spin_unlock_irqrestore(&tick_device_lock, flags);

        return ret;
}

 

int tick_is_broadcast_device(struct clock_event_device *dev)
{
        return (dev && tick_broadcast_device.evtdev == dev);
}

void clockevents_exchange_device(struct clock_event_device *old,
                                 struct clock_event_device *new)
{
        // <condition 1>
        //old = NULL
        //new = &hpet_clockevent 

        // <condition 2>
        // old = hpet 
        // new = lapic

        unsigned long flags;

        local_irq_save(flags);
        /*
         * Caller releases a clock event device. We queue it into the
         * released list and do a notify add later.
         */
        if (old) {
                clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
                list_del(&old->list);
                list_add(&old->list, &clockevents_released);
                // add it to the clockevents_released list
        }

        if (new) {
                BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
                clockevents_shutdown(new);
        }
        local_irq_restore(flags);
        // here we are looking at condition 1
}

 

static void tick_setup_device(struct tick_device *td, 
                              struct clock_event_device *newdev, int cpu, 
                              const struct cpumask *cpumask)
{
        ktime_t next_event;
        void (*handler)(struct clock_event_device *) = NULL;

        /*   
         * First device setup ?
         */
        if (!td->evtdev) {
        // this tick device has no clock event device attached yet
     
                /*   
                 * If no cpu took the do_timer update, assign it to
                 * this cpu:
                 */
                if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
                        // if no device has been chosen yet to take on the global tick
                        // (do_timer) duty, the current device gets the job

                        tick_do_timer_cpu = cpu; 
                        // record the id of the processor the current device belongs to
                        tick_next_period = ktime_get();

                        tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 
                        // the tick period, in nanoseconds
                        // HZ = 1000
                }    

                /*   
                 * Startup in periodic mode first.
                 */
                td->mode = TICKDEV_MODE_PERIODIC;
                // device operating mode --> periodic

        } else {
                // we will come back to this else branch some day; ignore it for now
                handler = td->evtdev->event_handler;
                next_event = td->evtdev->next_event;
                td->evtdev->event_handler = clockevents_handle_noop;
        }    

        td->evtdev = newdev;
        // attach the event device to the tick device

        /*
         * When the device is not per cpu, pin the interrupt to the
         * current cpu:
         */
        if (!cpumask_equal(newdev->cpumask, cpumask))
                irq_set_affinity(newdev->irq, cpumask);

        /*
         * When global broadcasting is active, check if the current
         * device is registered as a placeholder for broadcast mode.
         * This allows us to handle this x86 misfeature in a generic
         * way.
         */
        // check whether broadcast mode is needed: if the system enters a power-saving
        // state in which the local clock stops, the broadcast mechanism is used instead
        if (tick_device_uses_broadcast(newdev, cpu))
                return;

        if (td->mode == TICKDEV_MODE_PERIODIC)
                tick_setup_periodic(newdev, 0);
                // periodic mode invokes this ......
        else
                tick_setup_oneshot(newdev, handler, next_event);
                // one-shot mode
}

 

void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
{
        tick_set_periodic_handler(dev, broadcast);
        // broadcast = 0

        /* Broadcast setup ? */
        if (!tick_device_is_functional(dev))
                return;

        if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
            !tick_broadcast_oneshot_active()) {
                clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
                // here
                // switch it to periodic mode
        } else {
                unsigned long seq;
                ktime_t next;

                do {
                        seq = read_seqbegin(&xtime_lock);
                        next = tick_next_period;
                } while (read_seqretry(&xtime_lock, seq));

                clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);

                for (;;) {
                        if (!clockevents_program_event(dev, next, ktime_get()))
                                return;
                        next = ktime_add(next, tick_period);
                }
        }
}

void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
        if (!broadcast)
                dev->event_handler = tick_handle_periodic;
                // here
        else
                dev->event_handler = tick_handle_periodic_broadcast;
}

static inline int tick_device_is_functional(struct clock_event_device *dev)
{
        return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
}

At this point the registration of the global event_handler is nearly done, with event_handler = tick_handle_periodic. Wait, shouldn't it be "tick_handle_oneshot_broadcast"? Indeed. Be patient; the story is far from over, this is only the beginning ...

The Forbidden City

2012-02-19 13:08:34 +0000

I have been in Beijing for almost a year and never had a chance to take in the city's sights; with my internship about to end, a little sightseeing makes the trip to Beijing worthwhile.

The Forbidden City has always been a place I wanted to visit, because it carries so much. After the tour I was left a bit disappointed. To be honest there is not much inside anymore; only a shell remains, only the things others could not carry away. The buildings endure the erosion of time, standing alone in the wind, so very lonely ... The things I actually wanted to see are simply not here ...

A few photos:

Registration of Clock Sources

2012-01-20 23:18:15 +0000

What is a clock source, and how does it differ from a clock interrupt source?

Clock interrupt source: a device that raises interrupt requests at a fixed frequency; the system runs the corresponding interrupt handler, which updates jiffies and does a series of other important work. It is the system's pulse.

Possible clock interrupt sources: PIT, HPET, local APIC, etc.

Viewing the clock interrupt sources:

cat /proc/timer_list | grep "Clock Event Device:"

Clock Event Device: hpet

Clock Event Device: lapic

Clock Event Device: lapic

This says the global clock interrupt source is the HPET, that the system has two logical cpus, and that each CPU's local apic is likewise a clock interrupt source. (Other contents of /proc/timer_list will be covered in later posts.)

Clock source: cannot raise interrupt requests, but offers higher time resolution. Simply put, it is a timekeeping device of a certain precision. gettimeofday obtains the time (down to microseconds) through the timekeeper, which reads the currently selected clock source.

Possible clock sources: PIT, HPET, TSC, ACPI_PM, etc.

Viewing the clock source:

RHEL-5:

# dmesg | grep "time.c" (see the time_init_gtod function for details)

RHEL-6:

# cat /sys/devices/system/clocksource/clocksource0/current_clocksource

hpet

# cat /sys/devices/system/clocksource/clocksource0/available_clocksource

hpet acpi_pm

Change the current clocksource

# echo "acpi_pm" > /sys/devices/system/clocksource/clocksource0/current_clocksource

 

Now to the main topic: an introduction to clock source registration (my own understanding; corrections welcome).

1) hpet_time_init()
    -> hpet_enable()
         -> hpet_clocksource_register()
              -> clocksource_register(&clocksource_hpet)
2) tsc_init()
    -> init_tsc_clocksource()
         -> clocksource_register(&clocksource_tsc)

Called before system initialization finishes:
3) core_initcall(init_jiffies_clocksource)
    -> init_jiffies_clocksource()
        -> clocksource_register(&clocksource_jiffies)
maybe ...
4) fs_initcall(init_acpi_pm_clocksource)
    -> init_acpi_pm_clocksource()
        -> clocksource_register(&clocksource_acpi_pm)

 

Take clocksource_register(&clocksource_tsc) as an example:

int clocksource_register(struct clocksource *cs) 
{     
        /* calculate max idle time permitted for this clocksource */
        cs->max_idle_ns = clocksource_max_deferment(cs);

        mutex_lock(&clocksource_mutex);
        // the mutex protects the clocksource_list list
        clocksource_enqueue(cs);
        // add the clock source being registered to the clocksource_list list

        clocksource_select();
        // select a clock source
        clocksource_enqueue_watchdog(cs);
        mutex_unlock(&clocksource_mutex);
        return 0;
}     

 

static struct clocksource clocksource_tsc = {
        .name                   = "tsc",
        .rating                 = 300,
        .read                   = read_tsc,
        .resume                 = resume_tsc,
        .mask                   = CLOCKSOURCE_MASK(64),
        .shift                  = 22,
        .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
                                  CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
        .vread                  = vread_tsc,
#endif
};

 

static void clocksource_enqueue(struct clocksource *cs) 
{
        struct list_head *entry = &clocksource_list;
        // static LIST_HEAD(clocksource_list);
        // initialized empty: next and prev both point to the head itself

        struct clocksource *tmp;

        list_for_each_entry(tmp, &clocksource_list, list)
                /* Keep track of the place, where to insert */
                if (tmp->rating >= cs->rating)
                        entry = &tmp->list;
                // a higher rating means a better clock source; such entries go toward the front of the list
        list_add(&cs->list, entry);
        // suppose the TSC is being registered now: hpet was registered before it, and
        // tsc's rating is higher than hpet's, so tsc sits right at the head of clocksource_list
        // insertion into the circular doubly-linked list: hpet --> tsc --> head (clocksource_list)
}

 

static void clocksource_select(void)
{
        struct clocksource *best, *cs;

        if (!finished_booting || list_empty(&clocksource_list))
                return;
        // finished_booting is still 0 here, so clocksource_select returns immediately.
        // Near the end of system initialization clocksource_done_booting() sets
        // finished_booting = 1 and right away calls clocksource_select() to do the real
        // clock source selection.

        ......

}

 

By this point the clock sources have basically all been registered. Here is a diagram (HEAD being clocksource_list):
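
(The original diagram is not reproduced here. Assuming the usual ratings of that era, tsc = 300, hpet = 250, acpi_pm = 200, jiffies = 1, the list would look roughly like this, highest rating at the head:)

HEAD(clocksource_list) --> tsc(300) --> hpet(250) --> acpi_pm(200) --> jiffies(1) --> HEAD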

 

Finally this gets called:

fs_initcall(clocksource_done_booting)

 

static int __init clocksource_done_booting(void)
{
        finished_booting = 1;
        // initialization is done

        /*      
         * Run the watchdog first to eliminate unstable clock sources
         */
        clocksource_watchdog_kthread(NULL);

        mutex_lock(&clocksource_mutex);

        clocksource_select();
        // called again; this time the best clock source is actually selected

        mutex_unlock(&clocksource_mutex);
        return 0;
}

 

static void clocksource_select(void)
{
        struct clocksource *best, *cs;

        if (!finished_booting || list_empty(&clocksource_list))
                return;

        /* First clocksource on the list has the best rating. */
        best = list_first_entry(&clocksource_list, struct clocksource, list);
        // as in the diagram above, the first entry of the list is the best clock source

        /* Check for the override clocksource. */
        list_for_each_entry(cs, &clocksource_list, list) {
                if (strcmp(cs->name, override_name) != 0)
                        continue;
                /*
                 * Check to make sure we don't switch to a non-highres
                 * capable clocksource if the tick code is in oneshot
                 * mode (highres or nohz)
                 */
                if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
                    tick_oneshot_mode_active()) {
                        /* Override clocksource cannot be used. */
                        printk(KERN_WARNING "Override clocksource %s is not "
                               "HRT compatible. Cannot switch while in "
                               "HRT/NOHZ mode\n", cs->name);
                        override_name[0] = 0;
                } else
                        /* Override clocksource can be used. */
                        best = cs;
                break;
        }
        if (curr_clocksource != best) {
                printk(KERN_INFO "Switching to clocksource %s\n", best->name);
                // the "Switching to clocksource tsc" line seen in dmesg

                curr_clocksource = best;
                // record the clock source now in use

                timekeeping_notify(curr_clocksource);
                // Install a new clocksource
        }
}

 

void timekeeping_notify(struct clocksource *clock)
{     
        if (timekeeper.clock == clock)
                return;
        // clock = &clocksource_tsc
        // timekeeper.clock is still &clocksource_jiffies here, set up in timekeeping_init()
     
        stop_machine(change_clocksource, clock, NULL);
        tick_clock_notify();
}

 

The function that runs next should be change_clocksource. I did not study the code in between very closely and never quite figured out how execution reaches it (if you know, please tell me~~); presumably some groundwork has to happen first, since the clock source is about to be switched! (For the record: stop_machine() runs the function passed to it, here change_clocksource, on one CPU while all other CPUs spin with interrupts disabled, which is what makes the switch safe.)

static int change_clocksource(void *data)
{
        struct clocksource *new, *old;

        new = (struct clocksource *) data;
        // new = &clocksource_tsc

        timekeeping_forward_now();
        // bring the time up to date

        if (!new->enable || new->enable(new) == 0) {
                // new->enable == NULL

                old = timekeeper.clock;
                timekeeper_setup_internals(new);
                // install the new clock source: tsc --> timekeeper

                if (old->disable)
                        old->disable(old);
        }
        return 0;
}

 

static void timekeeping_forward_now(void)
{
        cycle_t cycle_now, cycle_delta;
        struct clocksource *clock;
        s64 nsec;

        clock = timekeeper.clock;
        // clock = &clocksource_jiffies 

        cycle_now = clock->read(clock);
        // returns jiffies

        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
        // elapsed time (in jiffies)

        clock->cycle_last = cycle_now;

        nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
                                  timekeeper.shift);
        // convert to nanoseconds: the last xtime update before the clock source is switched

        /* If arch requires, add in gettimeoffset() */
        nsec += arch_gettimeoffset();
        // empty function

        timespec_add_ns(&xtime, nsec);
        // add it into xtime

        nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
        timespec_add_ns(&raw_time, nsec);
}

 

static void timekeeper_setup_internals(struct clocksource *clock)
{
        cycle_t interval;
        u64 tmp;

        timekeeper.clock = clock;
        // install the clock source into the timekeeper; gettimeofday obtains wall time through the timekeeper layer

        clock->cycle_last = clock->read(clock);
        // read the clock source to get the current counter value

        /* Do the ns -> cycle conversion first, using original mult */
        tmp = NTP_INTERVAL_LENGTH;
        tmp <<= clock->shift;
        tmp += clock->mult/2;
        do_div(tmp, clock->mult);
        if (tmp == 0)
                tmp = 1;

        interval = (cycle_t) tmp;
        timekeeper.cycle_interval = interval;

        /* Go back from cycles -> shifted ns */
        timekeeper.xtime_interval = (u64) interval * clock->mult;
        timekeeper.raw_interval =
                ((u64) interval * clock->mult) >> clock->shift;

        timekeeper.xtime_nsec = 0;
        timekeeper.shift = clock->shift;

        timekeeper.ntp_error = 0;
        timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;

        /*
         * The timekeeper keeps its own mult values for the currently
         * active clocksource. These value will be adjusted via NTP
         * to counteract clock drifting.
         */
        timekeeper.mult = clock->mult;
}

 

void tick_clock_notify(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

 

That is it for clock source registration; next come the clock event devices!

An Analysis of the gettimeofday System Call

2012-01-05 21:37:59 +0000

Background:

Lately I have been writing timer-related programs out of necessity, and anything timer-related inevitably calls the gettimeofday system call. This function is famous: in a certain kernel version, creating many threads that invoke gettimeofday could make the kernel timer go backwards. With all sorts of questions in mind I decided to read the timer-related kernel code. Personally I find the timer code not that easy to follow; it touches many things (interrupts, process scheduling, CFS, and that dreadful SMP, and so on). It is, after all, the heartbeat of the system. Although I have touched some timer-related things before, I am not sure I can understand this code thoroughly. Anyway, do my best! I love a lyric: having set out, don't ask where the road is ...

gettimeofday simply fetches the wall time.

A user-space call to gettimeofday goes: int 0x80 -> software interrupt -> interrupt descriptor in the IDT -> segment selector -> segment descriptor in the GDT ... the interrupt handler runs -> eventually sys_gettimeofday executes. (There is also the privilege-level switch and so on along the way, which I won't belabor.)
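
For orientation, a minimal user-space caller looks like this (nothing here is kernel code, just the standard libc wrapper):

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
        struct timeval tv;

        /* tz is obsolete and almost always passed as NULL */
        if (gettimeofday(&tv, NULL) == 0)
                printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
        return 0;
}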

sys_gettimeofday is the kernel entry point, though it took me a lot of effort to find it. It looks roughly like this:

SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
                struct timezone __user *, tz)
{
        // tv and tz both come from user space; the struct tv points to receives the
        // current seconds and microseconds, and tz is usually NULL
        if (likely(tv != NULL)) {
                struct timeval ktv;
                do_gettimeofday(&ktv);
                if (copy_to_user(tv, &ktv, sizeof(ktv)))
                        return -EFAULT;
        }
        if (unlikely(tz != NULL)) {
                if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                        return -EFAULT;
        }
        return 0;
}

Why bother with the SYSCALL_DEFINE2 macro? Wouldn't a plain sys_gettimeofday do? It is quite annoying!
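
(To my understanding, the SYSCALL_DEFINEx wrappers exist so that 32-bit syscall arguments are explicitly sign-extended on 64-bit kernels, which closed a class of security bugs. Ignoring that wrapper, the macro expands to roughly the declaration below; a sketch, not the exact preprocessor output:)

/* approximate expansion of SYSCALL_DEFINE2(gettimeofday, tv, tz) */
asmlinkage long sys_gettimeofday(struct timeval __user *tv,
                                 struct timezone __user *tz);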

 

The core is the following function:

void do_gettimeofday(struct timeval *tv)
{
        struct timespec now;

        getnstimeofday(&now);
        // the current time comes back in now: seconds and nanoseconds

        tv->tv_sec = now.tv_sec;
        // seconds
        tv->tv_usec = now.tv_nsec/1000;
        // now.tv_nsec is the sub-second nanosecond part; convert it to microseconds
}

 

CONFIG_GENERIC_TIME=y

// defined in RHEL 6; this post is based on kernel 2.6.32 and later

 

struct timeval {
        __kernel_time_t         tv_sec;         /* seconds */
        __kernel_suseconds_t    tv_usec;        /* microseconds */
};

 

void getnstimeofday(struct timespec *ts)
{
        unsigned long seq;
        s64 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqbegin(&xtime_lock);
                // take the read side; if the write lock is held right now, spin here

                *ts = xtime;
                // xtime may very well be updated concurrently (by the timer interrupt handler)

                nsecs = timekeeping_get_ns();

                /* If arch requires, add in gettimeoffset() */
                nsecs += arch_gettimeoffset(); 
                // NULL function ?

        } while (read_seqretry(&xtime_lock, seq));
        // seqlock: decide whether to re-read. A timer interrupt may have updated xtime
        // in the meantime, making the value just read inaccurate, so it must be re-read.

        timespec_add_ns(ts, nsecs);
        // add nsecs into the struct timespec
}

 

There are three cases here:

1. The loop runs only once, and the current time is obtained cleanly.

2. The read side starts fine, but meanwhile a timer interrupt arrives and takes the write lock to update xtime (it has the higher priority); read_seqretry then returns 1, meaning the value read is no longer accurate and must be re-read, so we go around the loop again. And so on.

3. The read side spins because the write lock is already held (xtime being updated); once the write lock is released, the read proceeds.
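
And who is the writer here? The periodic tick path. For reference, in 2.6.32-era kernels it looks roughly like this (kernel/time/tick-common.c):

static void tick_periodic(int cpu)
{
        if (tick_do_timer_cpu == cpu) {
                write_seqlock(&xtime_lock);
                // only the CPU owning the do_timer duty updates xtime/jiffies

                /* Keep track of the next tick event */
                tick_next_period = ktime_add(tick_next_period, tick_period);

                do_timer(1);
                write_sequnlock(&xtime_lock);
        }

        update_process_times(user_mode(get_irq_regs()));
        profile_tick(CPU_PROFILING);
}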

 

struct timespec {
        time_t  tv_sec;         /* seconds */
        long    tv_nsec;        /* nanoseconds */
};

 

struct timespec xtime __attribute__ ((aligned (16)));

 

__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);

#define DEFINE_SEQLOCK(x) \
                seqlock_t x = __SEQLOCK_UNLOCKED(x)                                     

typedef struct {
        unsigned sequence;
        spinlock_t lock;
} seqlock_t;
// sequence records the writer's passes through the critical section: initialized to 0,
// incremented when the write lock is taken, and incremented again when it is released;
// an odd value means a writer currently holds the lock
// readers access the data directly without taking any lock

#define __SEQLOCK_UNLOCKED(lockname) \
                 { 0, __SPIN_LOCK_UNLOCKED(lockname) }

# define __SPIN_LOCK_UNLOCKED(lockname) \
        (spinlock_t)    {       .raw_lock = __RAW_SPIN_LOCK_UNLOCKED,   \
                                SPIN_DEP_MAP_INIT(lockname) }
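
The matching write side of the primitive (quoted here for completeness; this matches the generic seqlock implementation of that era) just bumps the counter on both sides of the critical section:

static inline void write_seqlock(seqlock_t *sl)
{
        spin_lock(&sl->lock);
        ++sl->sequence;
        // sequence becomes odd: readers spin or will retry
        smp_wmb();
}

static inline void write_sequnlock(seqlock_t *sl)
{
        smp_wmb();
        sl->sequence++;
        // sequence becomes even again: readers may proceed
        spin_unlock(&sl->lock);
}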

 

static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
{
        unsigned ret;

repeat:
        ret = sl->sequence;
        smp_rmb();
        if (unlikely(ret & 1)) {
                // an odd value means the write lock is held; wait here until the writer releases it ......
                cpu_relax();
                goto repeat;
        }

        return ret;
}

static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
{
        smp_rmb();

        return (sl->sequence != start);
        // unlikely -> if the value read now differs from the one read at the start, a
        //             writer updated xtime in between; return 1
        // likely   -> return 0 if they are equal
}

 

Next, a fairly important function:

static inline s64 timekeeping_get_ns(void)
{
        cycle_t cycle_now, cycle_delta;
        struct clocksource *clock;

        clock = timekeeper.clock;
        // the clock source: PIT/HPET/TSC
        // what is timekeeper? where is it initialized? I'll keep the suspense; later posts
        // will cover it. Briefly, the timekeeper is the interface behind gettimeofday and
        // friends, and underneath the timekeeper sits the clocksource

        cycle_now = clock->read(clock);
        // read the current cycle count; assuming TSC, this is read_tsc()

        /* calculate the delta since the last update_wall_time: */
        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
        // clock->cycle_last should be updated by the timer interrupt handler

        /* return delta convert to nanoseconds using ntp adjusted mult. */
        return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
                                  timekeeper.shift);
        // convert to nanoseconds
}

The details of timekeeper.mult and timekeeper.shift will be introduced later.

 

static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
                return ((u64) cycles * mult) >> shift;
}
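
A quick worked example of this conversion, assuming a hypothetical 2 GHz TSC with the shift of 22 from clocksource_tsc above:

/*
 * One cycle at 2 GHz lasts 0.5 ns, so mult = 0.5 * 2^22 = 2097152.
 * After 2,000,000,000 cycles (one second of wall time):
 *   (2000000000 * 2097152) >> 22 = 1000000000 ns = 1 s
 */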

 

static __always_inline void timespec_add_ns(struct timespec *a, u64 ns)
{
                a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns);
                // returns the number of whole seconds; normally 0
                a->tv_nsec = ns;
                // the sub-second nanosecond remainder (this value keeps accumulating)
}

 

The last function:

static __always_inline u32
__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
        // dividend = a->tv_nsec + ns
        u32 ret = 0;

        while (dividend >= divisor) {
                // while the nanosecond count is at least one second (NSEC_PER_SEC)

                /* The following asm() prevents the compiler from
                   optimising this loop into a modulo operation.  */
                asm("" : "+rm"(dividend));

                dividend -= divisor;
                // subtract one second

                ret++;
                // bump the second count by 1
        }

        *remainder = dividend;

        return ret;
}

 

Summary:

Before you know it, gettimeofday has been covered, though several details were left out, such as clock sources, clock source registration, timekeeping, and so on; later posts will cover them one by one. Honestly, the implementation of gettimeofday is not that hard; the parts I find hard to understand come later. Think of this as an appetizer!