任务调度

一. 前言

在前文中，我们分析了内核中进程和线程的统一结构体task_struct，并分析进程、线程的创建和派生的过程。在本文中，我们会对任务间调度进行详细剖析，了解其原理和整个执行过程。由此，进程、线程部分的大体框架就算是介绍完了。本节主要分为三个部分：Linux内核中常见的调度策略，调度的基本结构体以及调度发生的整个流程。下面将详细展开说明。

二. 调度策略

Linux的调度策略主要分为实时任务和普通任务。实时任务需求尽快返回结果，而普通任务则没有较高的要求。在前文中我们提到了task_struct中调度策略相应的变量为policy，调度优先级有prio, static_prio, normal_prio, rt_priority几个。优先级其实就是一个数值，对于实时进程，优先级的范围是 0～99；对于普通进程，优先级的范围是 100～139。数值越小，优先级越高。

2.1 实时调度策略

实时调度策略主要包括以下几种

SCHED_FIFO：先进先出型策略，顾名思义相同优先级的情况下先到先得
SCHED_RR：轮询策略，注重公平性，相同优先级的任务会使用相同的时间片轮流执行
SCHED_DEADLINE：根据任务结束时间来进行调度，即将结束的拥有较高的优先级

2.2 普通调度策略

普通调度策略主要包括以下几种

SCHED_NORMAL：普通任务
SCHED_BATCH：后台任务，优先级较低
SCHED_IDLE：空闲时间才会跑的任务
CFS：完全公平调度（Completely Fair Scheduler）。严格来说它并不是与上面几项并列的policy取值，而是内核用来实现上述普通调度策略的调度算法（对应后文的fair_sched_class）。CFS 会为每一个任务安排一个虚拟运行时间 vruntime。如果一个任务在运行，随着一个个 CPU时钟tick 的到来，任务的 vruntime 将不断增大，而没有得到执行的任务的 vruntime 不变。由此，当调度的时候，vruntime较小的就拥有较高的优先级。 vruntime的实际计算方式和权重相关，由此保证了优先级高的按比例拥有更多的执行时间，从而达到完全公平。

三. 调度相关的结构体

首先，我们需要一个结构体去执行调度策略，即sched_class。该类有几种实现

stop_sched_class 优先级最高的任务会使用这种策略，会中断所有其他线程，且不会被其他任务打断；
dl_sched_class 就对应上面的 deadline 调度策略；
rt_sched_class 就对应 RR 算法或者 FIFO 算法的调度策略，具体调度策略由进程的 task_struct->policy 指定；
fair_sched_class 就是普通进程的调度策略；
idle_sched_class 就是空闲进程的调度策略。

其次，我们需要一个调度结构体来集合调度信息，用于调度，即sched_entity，主要有

struct sched_entity se：普通任务调度实体
struct sched_rt_entity rt：实时调度实体
struct sched_dl_entity dl：DEADLINE调度实体

普通任务调度实体源码如下，这里面包含了 vruntime 和权重 load_weight，以及对于运行时间的统计。

/*
 * Scheduling entity for normal (fair-class) tasks: holds the load weight,
 * the virtual runtime and the run-time accounting used to order entities.
 */
struct sched_entity {
    /* For load-balancing: */
    struct load_weight      load;
    unsigned long           runnable_weight;
    /* Links this entity into a red-black tree of entities. */
    struct rb_node          run_node;
    struct list_head        group_node;
    unsigned int            on_rq;
    u64             exec_start;
    /* Total real time this entity has executed so far. */
    u64             sum_exec_runtime;
    /* Virtual runtime: grows while running; smaller values are picked first. */
    u64             vruntime;
    /* Value of sum_exec_runtime when the entity was last scheduled in. */
    u64             prev_sum_exec_runtime;
    u64             nr_migrations;
    struct sched_statistics     statistics;
#ifdef CONFIG_FAIR_GROUP_SCHED
    /* Group-scheduling hierarchy (only with CONFIG_FAIR_GROUP_SCHED). */
    int             depth;
    struct sched_entity     *parent;
    /* rq on which this entity is (to be) queued: */
    struct cfs_rq           *cfs_rq;
    /* rq "owned" by this entity/group: */
    struct cfs_rq           *my_q;
#endif
#ifdef CONFIG_SMP
    /*
     * Per entity load average tracking.
     *
     * Put into separate cache line so it does not
     * collide with read-mostly values above.
     */
    struct sched_avg        avg;
#endif
};

在调度时，多个任务调度实体会首先区分是实时任务还是普通任务，然后通过以时间为顺序的红黑树结构组合起来，vruntime 最小的在树的左侧，vruntime最大的在树的右侧。以CFS策略为例，则会选择红黑树最左边的叶子节点作为下一个将获得 CPU 的任务。这些队列都挂在每个CPU各自的运行队列（run queue）即struct rq之下：rq中的cfs、rt、dl三个成员分别是各调度类自己的子队列，红黑树位于这些子队列之中。

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
    /* runqueue lock: */
    raw_spinlock_t      lock;
    /*
     * nr_running and cpu_load should be in the same cacheline because
     * remote CPUs use both these fields when doing load calculation.
     */
    unsigned int        nr_running;
......
    #define CPU_LOAD_IDX_MAX 5
    unsigned long       cpu_load[CPU_LOAD_IDX_MAX];
......
    /* capture load from *all* tasks on this CPU: */
    struct load_weight  load;
    unsigned long       nr_load_updates;
    u64         nr_switches;
    struct cfs_rq       cfs;
    struct rt_rq        rt;
    struct dl_rq        dl;
......
    /*
     * This is part of a global counter where only the total sum
     * over all CPUs matters. A task can increase this counter on
     * one CPU and if it got migrated afterwards it may decrease
     * it on another CPU. Always updated under the runqueue lock:
     */
    unsigned long       nr_uninterruptible;
    struct task_struct  *curr;
    struct task_struct  *idle;
    struct task_struct  *stop;
    unsigned long       next_balance;
    struct mm_struct    *prev_mm;
    unsigned int        clock_update_flags;
    u64         clock;
    /* Ensure that all clocks are in the same cache line */
    u64         clock_task ____cacheline_aligned;
    u64         clock_pelt;
    unsigned long       lost_idle_time;
    atomic_t        nr_iowait;
......
    /* calc_load related fields */
    unsigned long       calc_load_update;
    long            calc_load_active;
......
};

其中包含结构体cfs_rq，其定义如下，主要是CFS调度相关的结构体，主要有权值相关变量、vruntime相关变量以及红黑树指针，其中结构体rb_root_cached即为红黑树的根（同时缓存了最左节点），而不是树中的某个节点

/* CFS-related fields in a runqueue */
struct cfs_rq {
    struct load_weight  load;
    unsigned long       runnable_weight;
    unsigned int        nr_running;
    unsigned int        h_nr_running;
    u64         exec_clock;
    u64         min_vruntime;
#ifndef CONFIG_64BIT
    u64         min_vruntime_copy;
#endif
    struct rb_root_cached   tasks_timeline;
    /*
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     */
    struct sched_entity *curr;
    struct sched_entity *next;
    struct sched_entity *last;
    struct sched_entity *skip;
......
};

对结构体dl_rq有类似的定义，运行队列由红黑树结构体构成，并按照deadline策略进行管理

/*
 * Deadline class' related fields in a runqueue.
 *
 * The queue itself is the rb-tree rooted at @root, ordered by deadline.
 * SMP builds add the earliest-deadline cache and the tree of pushable
 * tasks; non-SMP builds carry a struct dl_bw instead.
 */
struct dl_rq {
    /* runqueue is an rbtree, ordered by deadline */
    struct rb_root_cached   root;
    /* presumably the number of deadline tasks queued here -- TODO confirm */
    unsigned long       dl_nr_running;
#ifdef CONFIG_SMP
    /*
     * Deadline values of the currently executing and the
     * earliest ready task on this rq. Caching these facilitates
     * the decision whether or not a ready but not running task
     * should migrate somewhere else.
     */
    struct {
        u64     curr;
        u64     next;
    } earliest_dl;
    unsigned long       dl_nr_migratory;
    int         overloaded;
    /*
     * Tasks on this rq that can be pushed away. They are kept in
     * an rb-tree, ordered by tasks' deadlines, with caching
     * of the leftmost (earliest deadline) element.
     */
    struct rb_root_cached   pushable_dl_tasks_root;
#else
    /* Only present when CONFIG_SMP is not set. */
    struct dl_bw        dl_bw;
#endif
    /*
     * "Active utilization" for this runqueue: increased when a
     * task wakes up (becomes TASK_RUNNING) and decreased when a
     * task blocks
     */
    u64         running_bw;
    /*
     * Utilization of the tasks "assigned" to this runqueue (including
     * the tasks that are in runqueue and the tasks that executed on this
     * CPU and blocked). Increased when a task moves to this runqueue, and
     * decreased when the task moves away (migrates, changes scheduling
     * policy, or terminates).
     * This is needed to compute the "inactive utilization" for the
     * runqueue (inactive utilization = this_bw - running_bw).
     */
    u64         this_bw;
    u64         extra_bw;
    /*
     * Inverse of the fraction of CPU utilization that can be reclaimed
     * by the GRUB algorithm.
     */
    u64         bw_ratio;
};

对于实时队列相应的rt_rq则有所不同，并没有用红黑树实现，而是使用了优先级数组rt_prio_array。

/*
 * Real-Time classes' related field in a runqueue.
 *
 * Unlike the CFS and deadline queues, this structure has no rb-tree root;
 * the queue is the priority array @active (presumably one list per RT
 * priority -- TODO confirm against struct rt_prio_array).
 */
struct rt_rq {
    struct rt_prio_array    active;
    unsigned int        rt_nr_running;
    unsigned int        rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
    struct {
        int     curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
        int     next; /* next highest */
#endif
    } highest_prio;
#endif
#ifdef CONFIG_SMP
    /* SMP-only bookkeeping, including the list of tasks that may be pushed away. */
    unsigned long       rt_nr_migratory;
    unsigned long       rt_nr_total;
    int         overloaded;
    struct plist_head   pushable_tasks;
#endif /* CONFIG_SMP */
    int         rt_queued;
    int         rt_throttled;
    u64         rt_time;
    u64         rt_runtime;
    /* Nests inside the rq lock: */
    raw_spinlock_t      rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
    /* Back-pointers used only with RT group scheduling. */
    unsigned long       rt_nr_boosted;
    struct rq       *rq;
    struct task_group   *tg;
#endif
};

下面再看看调度类sched_class，该类以函数指针的形式定义了诸多队列操作，如

enqueue_task 向就绪队列中添加一个任务，当某个任务进入可运行状态时，调用这个函数；
dequeue_task 将一个任务从就绪队列中删除；
yield_task将主动放弃CPU；
yield_to_task主动放弃CPU并执行指定的task_struct；
check_preempt_curr检查当前任务是否可被抢占；
pick_next_task 选择接下来要运行的任务；
put_prev_task 用另一个进程代替当前运行的任务；
set_curr_task 用于修改调度策略；
task_tick 每次周期性时钟到的时候，这个函数被调用，可能触发调度。
task_dead:进程结束时调用
switched_from、switched_to:进程改变调度器时使用
prio_changed:改变进程优先级

struct sched_class {
    const struct sched_class *next;
    void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
    void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
    void (*yield_task)   (struct rq *rq);
    bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
    void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
    /*
     * It is the responsibility of the pick_next_task() method that will
     * return the next task to call put_prev_task() on the @prev task or
     * something equivalent.
     *
     * May return RETRY_TASK when it finds a higher prio class has runnable
     * tasks.
     */
    struct task_struct * (*pick_next_task)(struct rq *rq,
                           struct task_struct *prev,
                           struct rq_flags *rf);
    void (*put_prev_task)(struct rq *rq, struct task_struct *p);
......
    void (*set_curr_task)(struct rq *rq);
    void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
    void (*task_fork)(struct task_struct *p);
    void (*task_dead)(struct task_struct *p);
    /*
     * The switched_from() call is allowed to drop rq->lock, therefore we
     * cannot assume the switched_from/switched_to pair is serliazed by
     * rq->lock. They are however serialized by p->pi_lock.
     */
    void (*switched_from)(struct rq *this_rq, struct task_struct *task);
    void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
    void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
                  int oldprio);
    unsigned int (*get_rr_interval)(struct rq *rq,
                    struct task_struct *task);
    void (*update_curr)(struct rq *rq);
#define TASK_SET_GROUP      0
#define TASK_MOVE_GROUP     1
......
};

调度类分为下面几种：

/*
 * The five scheduling-class instances, each defined in its own file.
 * They are chained through sched_class::next; e.g. fair_sched_class.next
 * points at idle_sched_class. The order below appears to be descending
 * priority (stop first, idle last) -- TODO confirm against each .next.
 */
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;

队列操作中函数指针指向不同策略队列的实际执行函数，在linux/kernel/sched/目录下，fair.c、idle.c、rt.c等文件对不同类型的策略实现了不同的函数，如fair.c中定义了

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
    .next               = &idle_sched_class,
    .enqueue_task       = enqueue_task_fair,
    .dequeue_task       = dequeue_task_fair,
    .yield_task         = yield_task_fair,
    .yield_to_task      = yield_to_task_fair,
    .check_preempt_curr  = check_preempt_wakeup,
    .pick_next_task      = pick_next_task_fair,
    .put_prev_task       = put_prev_task_fair,
......
    .set_curr_task       = set_curr_task_fair,
    .task_tick          = task_tick_fair,
    .task_fork          = task_fork_fair,
    .prio_changed       = prio_changed_fair,
    .switched_from      = switched_from_fair,
    .switched_to        = switched_to_fair,
    .get_rr_interval    = get_rr_interval_fair,
    .update_curr        = update_curr_fair,
......
};

以选择下一个任务为例，CFS对应的是pick_next_task_fair，而rt_rq对应的则是pick_next_task_rt，等等。

由此，我们来总结一下：

每个CPU都有一个struct rq结构体，里面会有着cfs_rq, rt_rq等一系列队列
其中cfs_rq、dl_rq等队列由红黑树组织，红黑树里每一个节点为一个任务实体（如sched_entity）；rt_rq则如前所述不使用红黑树
每一个任务实体sched_entity对应于一个任务task_struct
在task_struct中对应的sched_class会根据不同策略声明不同的对应处理函数，处理实际的调度工作

四. 调度流程

有了上述的基本策略和基本调度结构体，我们可以形成大致的骨架，下面就是需要核心的调度流程将其拼凑成一个整体，实现调度系统。调度分为两种，主动调度和抢占式调度。

主动调度即任务执行一定时间以后主动让出CPU，通过调度策略选择合适的下一个任务执行。
抢占式调度即任务执行中收到了其他任务的中断，由此停止执行并切换至下一个任务。

4.1 主动调度

说到调度，逃不过核心函数schedule()。其中sched_submit_work()函数完成当前任务的收尾工作，以避免出现如死锁或者IO中断等情况。之后首先禁止抢占式调度的发生，然后调用__schedule()函数完成调度，之后重新打开抢占式调度，如果需要重新调度则会一直重复该过程，否则结束函数。

/*
 * schedule - voluntary entry point into the scheduler.
 *
 * Lets the current task finish pending submit work, then repeatedly calls
 * __schedule(false) with preemption disabled until no reschedule request
 * remains.
 */
asmlinkage __visible void __sched schedule(void)
{
    struct task_struct *tsk = current;
    /* Wrap-up work for the current task before it may go to sleep. */
    sched_submit_work(tsk);
    do {
        /* __schedule() must be called with preemption disabled. */
        preempt_disable();
        __schedule(false);
        /* Re-enable preemption without rescheduling here; the loop condition re-checks. */
        sched_preempt_enable_no_resched();
    } while (need_resched());
}
EXPORT_SYMBOL(schedule);

而__schedule()函数则是实际的核心调度函数，该函数主要操作包括选取下一进程和进行上下文切换，而上下文切换又包括用户态空间切换和内核态的切换。具体的解释可以参照英文源码注释以及中文对各个步骤的注释。

/*
 * __schedule() is the main scheduler function.
 * The main means of driving the scheduler and thus entering this function are:
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 * WARNING: must be called with preemption disabled!
 */
/*
 * @preempt: false when called from schedule(), true when called from the
 *           preemption paths (they pass __schedule(true)).
 */
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    unsigned long *switch_count;
    struct rq_flags rf;
    struct rq *rq;
    int cpu;
    
    // Fetch this CPU's runqueue; prev is the task currently running on it.
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;
    
    // Debug checks on prev before scheduling; clear the hrtick if that feature is on.
    schedule_debug(prev);
    if (sched_feat(HRTICK))
        hrtick_clear(rq);
    
    // Disable local interrupts, tell RCU about the context switch, then take
    // the runqueue lock followed by a memory barrier (see the comment below).
    local_irq_disable();
    rcu_note_context_switch(preempt);
    /*
     * Make sure that signal_pending_state()->signal_pending() below
     * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
     * done by the caller to avoid the race with signal_wake_up().
     *
     * The membarrier system call requires a full memory barrier
     * after coming from user-space, before storing to rq->curr.
     */
    rq_lock(rq, &rf);
    smp_mb__after_spinlock();
    
    /* Promote REQ to ACT */
    rq->clock_update_flags <<= 1;
    update_rq_clock(rq);
    // Default counter to bump on a switch; replaced by nvcsw below for the non-preempt path.
    switch_count = &prev->nivcsw;
    
    // Non-preempt call while prev->state is non-zero: prev asked to stop running.
    if (!preempt && prev->state) {
        // A pending signal may wake prev in its current state: keep it
        // runnable (TASK_RUNNING) instead of taking it off the queue.
        if (signal_pending_state(prev->state, prev)) {
            prev->state = TASK_RUNNING;
        } else {
            // Dequeue prev from rq and clear on_rq. If prev is waiting on
            // I/O, count it in nr_iowait and start block-I/O delay accounting.
            deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
            prev->on_rq = 0;
            if (prev->in_iowait) {
                atomic_inc(&rq->nr_iowait);
                delayacct_blkio_start();
            }
            /* Wake up another task if needed:
             * If a worker went to sleep, notify and ask workqueue
             * whether it wants to wake up a task to maintain
             * concurrency.
             */
            if (prev->flags & PF_WQ_WORKER) {
                struct task_struct *to_wakeup;
                to_wakeup = wq_worker_sleeping(prev);
                if (to_wakeup)
                    try_to_wake_up_local(to_wakeup, &rf);
            }
        }
        switch_count = &prev->nvcsw;
    }
    
    // Ask pick_next_task() for the task to run next, then clear the
    // pending reschedule request on prev.
    next = pick_next_task(rq, prev, &rf);
    clear_tsk_need_resched(prev);
    clear_preempt_need_resched();
    
    // A different task was chosen: account for it and switch context.
    if (likely(prev != next)) {
        rq->nr_switches++;
        rq->curr = next;
        /*
         * The membarrier system call requires each architecture
         * to have a full memory barrier after updating
         * rq->curr, before returning to user-space.
         *
         * Here are the schemes providing that barrier on the
         * various architectures:
         * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
         *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
         * - finish_lock_switch() for weakly-ordered
         *   architectures where spin_unlock is a full barrier,
         * - switch_to() for arm64 (weakly-ordered, spin_unlock
         *   is a RELEASE barrier),
         */
        ++*switch_count;
        trace_sched_switch(preempt, prev, next);
        /* Also unlocks the rq: */
        rq = context_switch(rq, prev, next, &rf);
    } else {
        // Same task keeps running: clear the clock-skip flags and release
        // the rq lock (the _irq variant; presumably re-enables interrupts).
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
        rq_unlock_irq(rq, &rf);
    }
    // Run balance callbacks for this rq.
    // NOTE(review): this is probably not red-black tree rebalancing;
    // confirm what balance_callback() does against its definition.
    balance_callback(rq);
}

其中核心函数是获取下一个任务的pick_next_task()以及上下文切换的context_switch()，下面详细展开剖析。首先看看pick_next_task()，该函数会根据调度策略分类，调用该类对应的调度函数选择下一个任务实体。根据前文分析我们知道，最终是在不同的红黑树上选择最左节点作为下一个任务实体并返回。

/*
 * Pick up the highest-prio task:
 *
 * Returns the task that should run next on @rq. @prev is the task being
 * switched away from and @rf the rq lock flags handed to the class hooks.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;
    /* Fast path: if only fair-class tasks are runnable, call
     * fair_sched_class.pick_next_task directly.
     * Optimization: we know that if all tasks are in the fair class we can
     * call that function directly, but only if the @prev task wasn't of a
     * higher scheduling class, because otherwise those loose the
     * opportunity to pull in more work from other CPUs.
     */
    if (likely((prev->sched_class == &idle_sched_class ||
            prev->sched_class == &fair_sched_class) &&
           rq->nr_running == rq->cfs.h_nr_running)) {
        p = fair_sched_class.pick_next_task(rq, prev, rf);
        /* RETRY_TASK: a higher-priority class has runnable tasks; use the full walk. */
        if (unlikely(p == RETRY_TASK))
            goto again;
        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq, prev, rf);
        return p;
    }
again:
    // Walk the classes in order and return the first task one of them yields;
    // restart the walk if a class answers RETRY_TASK.
    for_each_class(class) {
        p = class->pick_next_task(rq, prev, rf);
        if (p) {
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }
    /* The idle class should always have a runnable task: */
    BUG();
}

下面来看看上下文切换。上下文切换主要干两件事情，一是切换任务空间，也即虚拟内存；二是切换寄存器和 CPU 上下文。关于任务空间的切换放在内存部分的文章中详细介绍，这里先按下不表，通过任务空间切换实际完成了用户态的上下文切换工作。下面我们重点看一下内核态切换，即寄存器和CPU上下文的切换。

/*
 * context_switch - switch to the new MM and the new thread's register state.
 *
 * Called from __schedule() with the rq locked. Returns the runqueue reported
 * by finish_task_switch() once this task is running again.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
           struct task_struct *next, struct rq_flags *rf)
{
    struct mm_struct *mm, *oldmm;
    prepare_task_switch(rq, prev, next);
    mm = next->mm;
    oldmm = prev->active_mm;
    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_start_context_switch(prev);
    /*
     * If mm is non-NULL, we pass through switch_mm(). If mm is
     * NULL, we will pass through mmdrop() in finish_task_switch().
     * Both of these contain the full memory barrier required by
     * membarrier after storing to rq->curr, before returning to
     * user-space.
     */
    if (!mm) {
        /* next has no mm of its own: borrow prev's active mm and pin it. */
        next->active_mm = oldmm;
        mmgrab(oldmm);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm_irqs_off(oldmm, mm, next);
    if (!prev->mm) {
        /*
         * prev was itself running on a borrowed mm: hand it to
         * finish_task_switch() through rq->prev_mm, where it is dropped.
         */
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }
    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
    prepare_lock_switch(rq, next, rf);
    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);
    // barrier() is a compiler barrier: it keeps the compiler from reordering
    // switch_to() and finish_task_switch() during optimization.
    barrier();
    return finish_task_switch(prev);
}

switch_to()就是寄存器和栈的切换，它调用到了 __switch_to_asm。这是一段汇编代码，主要用于栈的切换，其中32位使用esp作为栈顶指针，64位使用rsp，其他部分代码一致。通过该段汇编代码我们完成了栈顶指针的切换，并调用__switch_to完成最终TSS的切换。注意switch_to中其实是有三个变量，分别是prev, next, last，而实际在使用时，我们会对last也赋值为prev。这里的设计意图需要结合一个例子来说明。假设有ABC三个任务，从A调度到B，B到C，最后C回到A，我们假设仅保存prev和next，则流程如下

A保存内核栈和寄存器，切换至B，此时prev = A, next = B，该状态会保存在栈里，等下次调用A的时候再恢复。然后调用B的finish_task_switch()继续执行下去，返回B的队列rq，
B保存内核栈和寄存器，切换至C
C保存内核栈和寄存器，切换至A。A从barrier()开始运行，而A从步骤1中保存的prev = A, next = B则完美的避开了C，丢失了C的信息。因此last指针的重要性就出现了。在执行完__switch_to_asm后，A的内核栈和寄存器重新覆盖了prev和next，但是我们通过返回值提供了C的内存地址，保存在last中，在finish_task_switch中完成清理工作。

/*
 * switch_to(prev, next, last) - switch stack and registers from @prev to @next.
 *
 * @last receives the return value of __switch_to_asm(), i.e. the task that
 * was running immediately before the resumed task got the CPU back. The
 * resumed task's own saved prev/next locals are stale by then, so this is
 * how it learns whom to clean up after.
 */
#define switch_to(prev, next, last)			      \
do {									       \
    prepare_switch_to(next);					\
                                               \
    ((last) = __switch_to_asm((prev), (next)));	  \
} while (0)

/*
 * %eax: prev task
 * %edx: next task
 */
ENTRY(__switch_to_asm)
......
  /* switch stack */
  movl  %esp, TASK_threadsp(%eax)
  movl  TASK_threadsp(%edx), %esp
......
  jmp  __switch_to
END(__switch_to_asm)

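最终调用__switch_to()函数。该函数中涉及到一个结构体TSS(Task State Segment)，在x86的硬件设计中该结构体用于存放所有的寄存器。另外还有一个特殊的寄存器TR（Task Register）会指向TSS，按照硬件提供的机制，更改TR的值会触发硬件保存CPU所有寄存器在当前TSS，并从新的TSS读取寄存器的值加载入CPU，从而由硬件完成一次上下文切换工作。但是Linux并不使用这种硬件任务切换（原因见下面源码注释中的NOTE部分）：系统初始化的时候，会调用 cpu_init()给每一个 CPU 关联一个 TSS，然后将 TR 指向这个 TSS，然后在操作系统的运行过程中，TR 就不切换了，永远指向这个 TSS。任务调度时并不修改TR的值，而是由__switch_to()以软件方式只更新其中需要变化的部分，例如下面代码中update_task_stack()重新加载的esp0，以及per-CPU变量cpu_current_top_of_stack和current_task。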

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %ax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
/*
 * @prev_p: task being switched away from.
 * @next_p: task being switched to.
 *
 * Returns @prev_p, which switch_to() stores in its "last" argument.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread,
                 *next = &next_p->thread;
    struct fpu *prev_fpu = &prev->fpu;
    struct fpu *next_fpu = &next->fpu;
    int cpu = smp_processor_id();
    /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
    switch_fpu_prepare(prev_fpu, cpu);
    /*
     * Save away %gs. No need to save %fs, as it was saved on the
     * stack on entry.  No need to save %es and %ds, as those are
     * always kernel segments while inside the kernel.  Doing this
     * before setting the new TLS descriptors avoids the situation
     * where we temporarily have non-reloadable segments in %fs
     * and %gs.  This could be an issue if the NMI handler ever
     * used %fs or %gs (it does not today), or if the kernel is
     * running inside of a hypervisor layer.
     */
    lazy_save_gs(prev->gs);
    /*
     * Load the per-thread Thread-Local Storage descriptor.
     */
    load_TLS(next, cpu);
    /*
     * Restore IOPL if needed.  In normal use, the flags restore
     * in the switch assembly will handle this.  But if the kernel
     * is running virtualized at a non-zero CPL, the popf will
     * not restore flags, so it must be done in a separate step.
     */
    if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
        set_iopl_mask(next->iopl);
    switch_to_extra(prev_p, next_p);
    /*
     * Leave lazy mode, flushing any hypercalls made here.
     * This must be done before restoring TLS segments so
     * the GDT and LDT are properly updated, and must be
     * done before fpu__restore(), so the TS bit is up
     * to date.
     */
    arch_end_context_switch(next_p);
    /*
     * Reload esp0 and cpu_current_top_of_stack.  This changes
     * current_thread_info().  Refresh the SYSENTER configuration in
     * case prev or next is vm86.
     */
    update_task_stack(next_p);
    refresh_sysenter_cs(next);
    /* Top of next's stack: base of its stack page plus THREAD_SIZE. */
    this_cpu_write(cpu_current_top_of_stack,
               (unsigned long)task_stack_page(next_p) +
               THREAD_SIZE);
    /*
     * Restore %gs if needed (which is common)
     */
    if (prev->gs | next->gs)
        lazy_load_gs(next->gs);
    switch_fpu_finish(next_fpu, cpu);
    /* From here on this CPU's current_task is next_p. */
    this_cpu_write(current_task, next_p);
    /* Load the Intel cache allocation PQR MSR. */
    resctrl_sched_in();
    /* Hand the previous task back so the caller can store it as "last". */
    return prev_p;
}

在完成了switch_to()的内核态切换后，还有一个重要的函数finish_task_switch()负责善后清理工作。在前面介绍switch_to三个参数的时候我们已经说明了使用last的重要性。而这里为何让prev和last均赋值为prev，是因为prev在后面没有需要用到，所以节省了一个指针空间来存储last。

/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 *
 * The context switch have flipped the stack from under us and restored the
 * local variables which were saved when this task called schedule() in the
 * past. prev == current is still correct but we need to recalculate this_rq
 * because prev may have moved to another CPU.
 */
/*
 * Runs on the new task's stack right after switch_to(). @prev is the task
 * that ran immediately before on this CPU; returns this CPU's runqueue.
 */
static struct rq *finish_task_switch(struct task_struct *prev)
    __releases(rq->lock)
{
    struct rq *rq = this_rq();
    /* mm stashed by context_switch() when prev had no mm of its own (may be NULL). */
    struct mm_struct *mm = rq->prev_mm;
    long prev_state;
    /*
     * The previous task will have left us with a preempt_count of 2
     * because it left us after:
     *
     *	schedule()
     *	  preempt_disable();			// 1
     *	  __schedule()
     *	    raw_spin_lock_irq(&rq->lock)	// 2
     *
     * Also, see FORK_PREEMPT_COUNT.
     */
    if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
              "corrupted preempt_count: %s/%d/0x%x\n",
              current->comm, current->pid, preempt_count()))
        preempt_count_set(FORK_PREEMPT_COUNT);
    rq->prev_mm = NULL;
    /*
     * A task struct has one reference for the use as "current".
     * If a task dies, then it sets TASK_DEAD in tsk->state and calls
     * schedule one last time. The schedule call will never return, and
     * the scheduled task must drop that reference.
     *
     * We must observe prev->state before clearing prev->on_cpu (in
     * finish_task), otherwise a concurrent wakeup can get prev
     * running on another CPU and we could rave with its RUNNING -> DEAD
     * transition, resulting in a double drop.
     */
    prev_state = prev->state;
    vtime_task_switch(prev);
    perf_event_task_sched_in(prev, current);
    finish_task(prev);
    /* Reconciles the locking set up before the switch (see __releases above). */
    finish_lock_switch(rq);
    finish_arch_post_lock_switch();
    kcov_finish_switch(current);
    fire_sched_in_preempt_notifiers(current);
    /*
     * When switching through a kernel thread, the loop in
     * membarrier_{private,global}_expedited() may have observed that
     * kernel thread and not issued an IPI. It is therefore possible to
     * schedule between user->kernel->user threads without passing though
     * switch_mm(). Membarrier requires a barrier after storing to
     * rq->curr, before returning to userspace, so provide them here:
     *
     * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
     *   provided by mmdrop(),
     * - a sync_core for SYNC_CORE.
     */
    if (mm) {
        membarrier_mm_sync_core_before_usermode(mm);
        mmdrop(mm);
    }
    /* prev has exited: run the class hook and drop its stack and task references. */
    if (unlikely(prev_state == TASK_DEAD)) {
        if (prev->sched_class->task_dead)
            prev->sched_class->task_dead(prev);
        /*
         * Remove function-return probe instances associated with this
         * task and put them back on the free list.
         */
        kprobe_flush_task(prev);
        /* Task is done with its stack. */
        put_task_stack(prev);
        put_task_struct(prev);
    }
    tick_nohz_task_switch();
    return rq;
}

至此，我们完成了内核态的切换工作，也完成了整个主动调度的过程。

4.2 抢占式调度

抢占式调度通常发生在两种情况下。一种是某任务执行时间过长，另一种是当某任务被唤醒的时候。首先看看任务执行时间过长的情况。

4.2.1 任务运行时间检测

该情况需要衡量一个任务的执行时间长短，执行时间过长则发起抢占。在计算机里面有一个时钟，会过一段时间触发一次时钟中断，通知操作系统时间又过去一个时钟周期，通过这种方式可以查看是否是需要抢占的时间点。

时钟中断处理函数会调用scheduler_tick()。该函数首先取出当前CPU，并由此获取对应的运行队列rq和当前任务curr。接着调用该任务的调度类sched_class对应的task_tick()函数进行时间事件处理。

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct task_struct *curr = rq->curr;
    struct rq_flags rf;
    sched_clock_tick();
    rq_lock(rq, &rf);
    update_rq_clock(rq);
    curr->sched_class->task_tick(rq, curr, 0);
    cpu_load_update_active(rq);
    calc_global_load_tick(rq);
    psi_task_tick(rq);
    rq_unlock(rq, &rf);
    perf_event_task_tick();
......
}

以普通任务队列为例，对应的调度类为fair_sched_class，对应的时钟处理函数为task_tick_fair()，该函数会获取当前的调度实体和运行队列，并调用entity_tick()函数更新时间。

/*
 * scheduler tick hitting a task of our scheduling class.
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
/*
 * @rq:     runqueue the tick is being processed for.
 * @curr:   task currently running on @rq.
 * @queued: passed through unchanged to entity_tick().
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &curr->se;
    /* Tick every entity visited by the iterator, each on its own cfs_rq. */
    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        entity_tick(cfs_rq, se, queued);
    }
    /* NUMA balancing work only when that static branch is enabled. */
    if (static_branch_unlikely(&sched_numa_balancing))
        task_tick_numa(rq, curr);
    update_misfit_status(curr, rq);
    update_overutilized_status(task_rq(curr));
}

在entity_tick()中，首先会调用update_curr()更新当前任务的vruntime，然后调用check_preempt_tick()检测现在是否可以发起抢占。

static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);
    /*
     * Ensure that runnable average is periodically updated.
     */
    update_load_avg(cfs_rq, curr, UPDATE_TG);
    update_cfs_group(curr);
......
    if (cfs_rq->nr_running > 1)
        check_preempt_tick(cfs_rq, curr);
}

check_preempt_tick() 先是调用 sched_slice() 函数计算出一个调度周期中该任务运行的实际时间 ideal_runtime。sum_exec_runtime 指任务总共执行的实际时间，prev_sum_exec_runtime 指上次该进程被调度时已经占用的实际时间，所以 sum_exec_runtime - prev_sum_exec_runtime 就是这次调度占用实际时间。如果这个时间大于 ideal_runtime，则应该被抢占了。除了这个条件之外，还会通过 __pick_first_entity 取出红黑树中最小的进程。如果当前进程的 vruntime 大于红黑树中最小的进程的 vruntime，且差值大于 ideal_runtime，也应该被抢占了。

/*
 * Tick-time preemption check for the running entity @curr on @cfs_rq.
 *
 * Requests a reschedule when either
 *  - @curr has run longer in this stint than its slice (ideal_runtime), or
 *  - @curr has run at least sysctl_sched_min_granularity and its vruntime
 *    exceeds that of the first entity in the tree by more than
 *    ideal_runtime.
 *
 * NOTE(review): the header in the quoted source read "Preempt the current
 * task with a newly woken task if needed", which does not match what this
 * function checks.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    unsigned long ideal_runtime, delta_exec;
    struct sched_entity *se;
    s64 delta;
    /* Slice computed for curr by sched_slice(). */
    ideal_runtime = sched_slice(cfs_rq, curr);
    /* Real time consumed since curr was last scheduled in. */
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    if (delta_exec > ideal_runtime) {
        resched_curr(rq_of(cfs_rq));
        /*
         * The current task ran long enough, ensure it doesn't get
         * re-elected due to buddy favours.
         */
        clear_buddies(cfs_rq, curr);
        return;
    }
    /*
     * Ensure that a task that missed wakeup preemption by a
     * narrow margin doesn't have to wait for a full slice.
     * This also mitigates buddy induced latencies under load.
     */
    if (delta_exec < sysctl_sched_min_granularity)
        return;
    /* Compare against the first entity returned for this cfs_rq. */
    se = __pick_first_entity(cfs_rq);
    delta = curr->vruntime - se->vruntime;
    /* curr is not ahead of that entity: nothing to do. */
    if (delta < 0)
        return;
    if (delta > ideal_runtime)
        resched_curr(rq_of(cfs_rq));
}

如果确认需要被抢占，则会调用resched_curr()函数，该函数会调用set_tsk_need_resched()标记该任务为_TIF_NEED_RESCHED，即该任务应该被抢占。

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
    struct task_struct *curr = rq->curr;
    int cpu;
.......
    cpu = cpu_of(rq);
    if (cpu == smp_processor_id()) {
        set_tsk_need_resched(curr);
        set_preempt_need_resched();
        return;
    }
    if (set_nr_and_not_polling(curr))
        smp_send_reschedule(cpu);
    else
        trace_sched_wake_idle_without_ipi(cpu);
}

4.2.2 任务唤醒情况

某些任务会因为中断而唤醒，如当 I/O 到来的时候，I/O进程往往会被唤醒。在这种时候，如果被唤醒的任务优先级高于 CPU 上的当前任务，就会触发抢占。try_to_wake_up() 调用 ttwu_queue() 将这个唤醒的任务添加到队列当中。ttwu_queue() 再调用 ttwu_do_activate() 激活这个任务。ttwu_do_activate() 调用 ttwu_do_wakeup()。这里面调用了 check_preempt_curr() 检查是否应该发生抢占。如果应该发生抢占，也不是直接踢走当前进程，而是将当前进程标记为应该被抢占。

static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
         struct rq_flags *rf)
{
  check_preempt_curr(rq, p, wake_flags);
  p->state = TASK_RUNNING;
  trace_sched_wakeup(p);

4.2.3 抢占的发生

由前面的分析，我们知道了不论是当前任务执行时间过长还是新任务唤醒，我们均会将现在的任务标记为_TIF_NEED_RESCHED，下面分析实际抢占的发生。真正的抢占还需要一个特定的时机让正在运行中的进程有机会调用一下 __schedule()函数，发起真正的调度。

实际上会调用__schedule()函数共有以下几个时机

从系统调用返回用户态：以64位为例，系统调用的链路为do_syscall_64->syscall_return_slowpath->prepare_exit_to_usermode->exit_to_usermode_loop。在exit_to_usermode_loop中，会检测是否设置了_TIF_NEED_RESCHED，如果是则调用schedule()，进而进入__schedule()

static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
    while (true) {
        /* We have work to do. */
        local_irq_enable();

        if (cached_flags & _TIF_NEED_RESCHED)
          schedule();
......
  }

内核态启动：内核态的执行中，被抢占的时机一般发生在 preempt_enable() 中。在内核态的执行中，有的操作是不能被中断的，所以在进行这些操作之前，总是先调用 preempt_disable() 关闭抢占，当再次打开的时候，就是一次内核态代码被抢占的机会。preempt_enable() 会调用 preempt_count_dec_and_test()，判断 preempt_count 和 TIF_NEED_RESCHED 是否可以被抢占。如果可以，就调用 preempt_schedule->preempt_schedule_common->__schedule 进行调度。

/*
 * preempt_enable - drop one level of preemption disabling and, if that
 * makes the task preemptible while a reschedule is pending, enter the
 * scheduler through __preempt_schedule().
 */
#define preempt_enable() \
do { \
  if (unlikely(preempt_count_dec_and_test())) \
    __preempt_schedule(); \
} while (0)

/*
 * Decrement the preempt count by one; evaluates to true when the count
 * is now zero and a reschedule has been requested.
 */
#define preempt_count_dec_and_test() \
  ({ preempt_count_sub(1); should_resched(0); })

/*
 * should_resched - true when preempt_count() equals @preempt_offset and
 * TIF_NEED_RESCHED is set on the current thread.
 */
static __always_inline bool should_resched(int preempt_offset)
{
  return unlikely(preempt_count() == preempt_offset &&
      tif_need_resched());
}

/* Tests the TIF_NEED_RESCHED thread flag. */
#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)

static void __sched notrace preempt_schedule_common(void)
{
  do {
......
    __schedule(true);
......
  } while (need_resched())

从中断返回内核态/用户态：中断处理调用的是 do_IRQ 函数，中断完毕后分为两种情况，一个是返回用户态，一个是返回内核态。
- 返回用户态会调用 prepare_exit_to_usermode()，最终调用 exit_to_usermode_loop()
- 返回内核态会调用preempt_schedule_irq()，最终调用__schedule()

common_interrupt:
        ASM_CLAC
        addq    $-0x80, (%rsp) 
        interrupt do_IRQ
ret_from_intr:
        popq    %rsp
        testb   $3, CS(%rsp)
        jz      retint_kernel
/* Interrupt came from user space */
GLOBAL(retint_user)
        mov     %rsp,%rdi
        call    prepare_exit_to_usermode
        TRACE_IRQS_IRETQ
        SWAPGS
        jmp     restore_regs_and_iret
/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
        bt      $9, EFLAGS(%rsp)  
        jnc     1f
0:      cmpl    $0, PER_CPU_VAR(__preempt_count)
        jnz     1f
        call    preempt_schedule_irq
        jmp     0b

asmlinkage __visible void __sched preempt_schedule_irq(void)
{
......
  do {
    preempt_disable();
    local_irq_enable();
    __schedule(true);
    local_irq_disable();
    sched_preempt_enable_no_resched();
  } while (need_resched());
......
}