cpuidle子系统之（三）：driver层

关注公众号不迷路：DumpStack

扫码加关注

一、从设备树中提取C state信息
- 1.1 设备树信息
  - 1.1.1 msm8916设备树
  - 1.1.2 msm8998设备树
- 1.2 从设备树中提取C state信息
二、示例一：cpuidle-arm - ARM32平台
三、示例二：cpuidle-big_little - ARM32平台
四、示例三：cpuidle-psci - ARM64平台
五、总结
- 5.1 怎样描述一个C state
- 5.2 怎样进入idle
关注公众号不迷路：DumpStack

driver层直接和硬件对接，回答了下面问题：

怎样进入指定的C state？是操作对应的寄存器，还是执行对应的指令？
什么时候退出idle？要怎样才能退出idle？是收到中断自动退出还是写相应的寄存器？
怎样理解不同等级的C state？

可见driver层是和平台强相关的一层，不同平台的cpu会有不同的driver

一、从设备树中提取C state信息

注意：这里从设备树中提取的是标准内核支持的C state属性；另外还有一些平台自定义的属性，在arm_cpuidle_init中解析（见2.4节）

文件位置：W:\opensource\linux-5.10.61\drivers\cpuidle\dt_idle_states.c

1.1 设备树信息

设备树中描述C state信息存在下面两种格式：

1.1.1 msm8916设备树

W:\opensource\linux-5.10.61\arch\arm64\boot\dts\qcom\msm8916.dtsi

cpus {

#address-cells = <1>;

#size-cells = <0>;

CPU0: cpu@0 {

device_type = "cpu";

compatible = "arm,cortex-a53";

reg = <0x0>;

next-level-cache = <&L2_0>;

enable-method = "psci";

clocks = <&apcs>;

operating-points-v2 = <&cpu_opp_table>;

#cooling-cells = <2>;

power-domains = <&CPU_PD0>; //指定这个cpu所属的电源域

power-domain-names = "psci";

};

CPU1: cpu@1 {

device_type = "cpu";

compatible = "arm,cortex-a53";

reg = <0x1>;

next-level-cache = <&L2_0>;

enable-method = "psci";

clocks = <&apcs>;

operating-points-v2 = <&cpu_opp_table>;

#cooling-cells = <2>;

power-domains = <&CPU_PD1>;

power-domain-names = "psci";

};

CPU2: cpu@2 {

device_type = "cpu";

compatible = "arm,cortex-a53";

reg = <0x2>;

next-level-cache = <&L2_0>;

enable-method = "psci";

clocks = <&apcs>;

operating-points-v2 = <&cpu_opp_table>;

#cooling-cells = <2>;

power-domains = <&CPU_PD2>;

power-domain-names = "psci";

};

CPU3: cpu@3 {

device_type = "cpu";

compatible = "arm,cortex-a53";

reg = <0x3>;

next-level-cache = <&L2_0>;

enable-method = "psci";

clocks = <&apcs>;

operating-points-v2 = <&cpu_opp_table>;

#cooling-cells = <2>;

power-domains = <&CPU_PD3>;

power-domain-names = "psci";

};

L2_0: l2-cache {

compatible = "cache";

cache-level = <2>;

};

# 描述不同等级的C state，本例只有一个等级

idle-states {

entry-method = "psci";

# 下面描述一个C state

CPU_SLEEP_0: cpu-sleep-0 {

compatible = "arm,idle-state";

idle-state-name = "standalone-power-collapse";

arm,psci-suspend-param = <0x40000002>;

entry-latency-us = <130>; #进入该状态延迟

exit-latency-us = <150>; #退出该状态延迟

min-residency-us = <2000>; #最小滞留时间

local-timer-stop; #在该状态下，local timer是否需要关闭

};

# 对不同的电源域的进入/退出延迟、最小滞留时间进行描述

domain-idle-states {

CLUSTER_RET: cluster-retention {

compatible = "domain-idle-state";

arm,psci-suspend-param = <0x41000012>;

entry-latency-us = <500>; #进入该状态延时

exit-latency-us = <500>; #退出该状态延迟

min-residency-us = <2000>; #最小滞留时间

};

CLUSTER_PWRDN: cluster-gdhs {

compatible = "domain-idle-state";

arm,psci-suspend-param = <0x41000032>;

entry-latency-us = <2000>; #进入该状态延时

exit-latency-us = <2000>; #退出该状态延迟

min-residency-us = <6000>; #最小滞留时间

};

#下面描述电源域

psci {

compatible = "arm,psci-1.0";

method = "smc";

CPU_PD0: power-domain-cpu0 {

#power-domain-cells = <0>;

power-domains = <&CLUSTER_PD>;

domain-idle-states = <&CPU_SLEEP_0>; #描述这个电源域的C state

};

CPU_PD1: power-domain-cpu1 {

#power-domain-cells = <0>;

power-domains = <&CLUSTER_PD>;

domain-idle-states = <&CPU_SLEEP_0>;

};

CPU_PD2: power-domain-cpu2 {

#power-domain-cells = <0>;

power-domains = <&CLUSTER_PD>;

domain-idle-states = <&CPU_SLEEP_0>;

};

CPU_PD3: power-domain-cpu3 {

#power-domain-cells = <0>;

power-domains = <&CLUSTER_PD>;

domain-idle-states = <&CPU_SLEEP_0>;

};

CLUSTER_PD: power-domain-cluster {

#power-domain-cells = <0>;

domain-idle-states = <&CLUSTER_RET>, <&CLUSTER_PWRDN>;

};

1.1.2 msm8998设备树

文件位置：W:\opensource\linux-5.10.61\arch\arm64\boot\dts\qcom\msm8998.dtsi

cpus {

#address-cells = <2>;

#size-cells = <0>;

CPU0: cpu@0 {

device_type = "cpu";

compatible = "qcom,kryo280";

reg = <0x0 0x0>;

enable-method = "psci";

#这个cpu支持的idle等级，这里表示有两级idle

cpu-idle-states = <&LITTLE_CPU_SLEEP_0 &LITTLE_CPU_SLEEP_1>;

next-level-cache = <&L2_0>;

L2_0: l2-cache {

compatible = "arm,arch-cache";

cache-level = <2>;

};

L1_I_0: l1-icache {

compatible = "arm,arch-cache";

};

L1_D_0: l1-dcache {

compatible = "arm,arch-cache";

};

CPU1: cpu@1 {

device_type = "cpu";

compatible = "qcom,kryo280";

reg = <0x0 0x1>;

enable-method = "psci";

cpu-idle-states = <&LITTLE_CPU_SLEEP_0 &LITTLE_CPU_SLEEP_1>;

next-level-cache = <&L2_0>;

L1_I_1: l1-icache {

compatible = "arm,arch-cache";

};

L1_D_1: l1-dcache {

compatible = "arm,arch-cache";

};

CPU2: cpu@2 {

device_type = "cpu";

compatible = "qcom,kryo280";

reg = <0x0 0x2>;

enable-method = "psci";

cpu-idle-states = <&LITTLE_CPU_SLEEP_0 &LITTLE_CPU_SLEEP_1>;

next-level-cache = <&L2_0>;

L1_I_2: l1-icache {

compatible = "arm,arch-cache";

};

L1_D_2: l1-dcache {

compatible = "arm,arch-cache";

};

CPU3: cpu@3 {

device_type = "cpu";

compatible = "qcom,kryo280";

reg = <0x0 0x3>;

enable-method = "psci";

cpu-idle-states = <&LITTLE_CPU_SLEEP_0 &LITTLE_CPU_SLEEP_1>;

next-level-cache = <&L2_0>;

L1_I_3: l1-icache {

compatible = "arm,arch-cache";

};

L1_D_3: l1-dcache {

compatible = "arm,arch-cache";

};

CPU4: cpu@100 {

device_type = "cpu";

compatible = "qcom,kryo280";

reg = <0x0 0x100>;

enable-method = "psci";

cpu-idle-states = <&BIG_CPU_SLEEP_0 &BIG_CPU_SLEEP_1>; #大核的C state

next-level-cache = <&L2_1>;

L2_1: l2-cache {

compatible = "arm,arch-cache";

cache-level = <2>;

};

L1_I_100: l1-icache {

compatible = "arm,arch-cache";

};

L1_D_100: l1-dcache {

compatible = "arm,arch-cache";

};

CPU5: cpu@101 {

device_type = "cpu";

compatible = "qcom,kryo280";

reg = <0x0 0x101>;

enable-method = "psci";

cpu-idle-states = <&BIG_CPU_SLEEP_0 &BIG_CPU_SLEEP_1>;

next-level-cache = <&L2_1>;

L1_I_101: l1-icache {

compatible = "arm,arch-cache";

};

L1_D_101: l1-dcache {

compatible = "arm,arch-cache";

};

CPU6: cpu@102 {

device_type = "cpu";

compatible = "qcom,kryo280";

reg = <0x0 0x102>;

enable-method = "psci";

cpu-idle-states = <&BIG_CPU_SLEEP_0 &BIG_CPU_SLEEP_1>;

next-level-cache = <&L2_1>;

L1_I_102: l1-icache {

compatible = "arm,arch-cache";

};

L1_D_102: l1-dcache {

compatible = "arm,arch-cache";

};

CPU7: cpu@103 {

device_type = "cpu";

compatible = "qcom,kryo280";

reg = <0x0 0x103>;

enable-method = "psci";

cpu-idle-states = <&BIG_CPU_SLEEP_0 &BIG_CPU_SLEEP_1>;

next-level-cache = <&L2_1>;

L1_I_103: l1-icache {

compatible = "arm,arch-cache";

};

L1_D_103: l1-dcache {

compatible = "arm,arch-cache";

};

cpu-map {

cluster0 {

core0 {

cpu = <&CPU0>;

};

core1 {

cpu = <&CPU1>;

};

core2 {

cpu = <&CPU2>;

};

core3 {

cpu = <&CPU3>;

};

cluster1 {

core0 {

cpu = <&CPU4>;

};

core1 {

cpu = <&CPU5>;

};

core2 {

cpu = <&CPU6>;

};

core3 {

cpu = <&CPU7>;

};

# 不同的C state的信息

idle-states {

entry-method = "psci";

LITTLE_CPU_SLEEP_0: cpu-sleep-0-0 {

compatible = "arm,idle-state";

idle-state-name = "little-retention";

arm,psci-suspend-param = <0x00000002>;

entry-latency-us = <81>; #进入该C state的延迟

exit-latency-us = <86>; #退出该C state的延迟

min-residency-us = <200>; #最小滞留时间

};

LITTLE_CPU_SLEEP_1: cpu-sleep-0-1 {

compatible = "arm,idle-state";

idle-state-name = "little-power-collapse";

arm,psci-suspend-param = <0x40000003>;

entry-latency-us = <273>;

exit-latency-us = <612>;

min-residency-us = <1000>;

local-timer-stop; #在该C state下，local timer关闭

};

BIG_CPU_SLEEP_0: cpu-sleep-1-0 {

compatible = "arm,idle-state";

idle-state-name = "big-retention";

arm,psci-suspend-param = <0x00000002>;

entry-latency-us = <79>;

exit-latency-us = <82>;

min-residency-us = <200>;

};

BIG_CPU_SLEEP_1: cpu-sleep-1-1 {

compatible = "arm,idle-state";

idle-state-name = "big-power-collapse";

arm,psci-suspend-param = <0x40000003>;

entry-latency-us = <336>;

exit-latency-us = <525>;

min-residency-us = <1000>;

local-timer-stop;

};

1.2 从设备树中提取C state信息

1.2.1 dt_init_idle_driver - 每个cpu调用一次该函数

每个cpu调用一次该函数，初始化这个cpu的C state信息

/**

* dt_init_idle_driver() - Parse the DT idle states and initialize the

* idle driver states array

* @drv: Pointer to CPU idle driver to be initialized

* @matches: Array of of_device_id match structures to search in for

* compatible idle state nodes. The data pointer for each valid

* struct of_device_id entry in the matches array must point to

* a function with the following signature, that corresponds to

* the CPUidle state enter function signature:

* int (*)(struct cpuidle_device *dev,

* struct cpuidle_driver *drv,

* int index);

* @start_idx: First idle state index to be initialized

* If DT idle states are detected and are valid the state count and states

* array entries in the cpuidle driver are initialized accordingly starting

* from index start_idx.

* Return: number of valid DT idle states parsed, <0 on failure

int dt_init_idle_driver(

struct cpuidle_driver *drv, //为哪个driver解析C state

const struct of_device_id *matches, //match表中记录着进入idle的函数

unsigned int start_idx) //要初始化的第一个C state

{

struct cpuidle_state *idle_state;

struct device_node *state_node, *cpu_node;

const struct of_device_id *match_id;

int i, err = 0;

const cpumask_t *cpumask;

unsigned int state_idx = start_idx;

//1.最多允许支持10个C state

if (state_idx >= CPUIDLE_STATE_MAX)

return -EINVAL;

* We get the idle states for the first logical cpu in the

* driver mask (or cpu_possible_mask if the driver cpumask is not set)

* and we check through idle_state_valid() if they are uniform

* across CPUs, otherwise we hit a firmware misconfiguration.

//2.获取这个driver控制的cpu，未指定的话就是所有可能的cpu

cpumask = drv->cpumask ? : cpu_possible_mask;

//3.获取设备树中的cpu的节点

cpu_node = of_cpu_device_node_get(cpumask_first(cpumask));

//4.遍历每一级C state

for (i = 0; ; i++) {

//4.获取这个cpu的idle等级对应的设备树节点

state_node = of_get_cpu_state_node(cpu_node, i);

if (!state_node)

break;

//5.通过match表找到对应的节点，假设match表为"arm,idle-state"

match_id = of_match_node(matches, state_node);

if (!match_id) {

err = -ENODEV;

break;

}

//6.判断节点是否有效

if (!of_device_is_available(state_node)) {

of_node_put(state_node);

continue;

}

//7.校验cpumask中的所有cpu，在第i级的idle状态是不是一样的

// 也就是说：同一个driver中的所有的cpu的所有idle层级都应该是一样的

if (!idle_state_valid(state_node, i, cpumask)) {

pr_warn("%pOF idle state not valid, bailing out\n",

state_node);

err = -EINVAL;

break;

}

//8.最多遍历10级C state

if (state_idx == CPUIDLE_STATE_MAX) {

pr_warn("State index reached static CPU idle driver states array size\n");

break;

}

//9.获取该层级对应的cpuidle_state结构的空间，

// 下面要从设备树中解析参数来填充这个空间了

idle_state = &drv->states[state_idx++];

err = init_state_node(idle_state, match_id, state_node);

if (err) {

pr_err("Parsing idle state node %pOF failed with err %d\n",

state_node, err);

err = -EINVAL;

break;

}

of_node_put(state_node);

}

of_node_put(state_node);

of_node_put(cpu_node);

if (err)

return err;

* Update the driver state count only if some valid DT idle states

* were detected

if (i)

drv->state_count = state_idx;

* Return the number of present and valid DT idle states, which can

* also be 0 on platforms with missing DT idle states or legacy DT

* configuration predating the DT idle states bindings.

return i;

}

1.2.2 of_get_cpu_state_node - 从设备树中找出描述idle等级的节点

找出指定cpu的C state节点，有两种格式的设备树用于描述这个cpu可用哪些C state

第一种格式是在cpu节点中使用power-domains和#power-domain-cells属性描述其所属的电源域节点，然后再根据电源域节点中的domain-idle-states属性找到对应的C state节点
另一种格式是直接在cpu节点中通过cpu-idle-states属性找到这个cpu支持的C state节点

/**

* of_get_cpu_state_node - Get CPU's idle state node at the given index

* @cpu_node: The device node for the CPU

* @index: The index in the list of the idle states

* Two generic methods can be used to describe a CPU's idle states, either via

* a flattened description through the "cpu-idle-states" binding or via the

* hierarchical layout, using the "power-domains" and the "domain-idle-states"

* bindings. This function check for both and returns the idle state node for

* the requested index.

* In case an idle state node is found at @index, the refcount is incremented

* for it, so call of_node_put() on it when done. Returns NULL if not found.

struct device_node *of_get_cpu_state_node(

struct device_node *cpu_node, //哪个cpu

int index) //idx表示idle等级

{

struct of_phandle_args args;

int err;

//1.第一种格式：msm8916

// 获取到CPU_PDx对应的设备树节点，赋值给args

err = of_parse_phandle_with_args(cpu_node, "power-domains",

"#power-domain-cells", 0, &args);

if (!err) {

//2.找到指定的idle等级对应的设备树

struct device_node *state_node =

of_parse_phandle(args.np, "domain-idle-states", index);

of_node_put(args.np);

if (state_node)

return state_node;

}

//2.第二种格式：msm8998

return of_parse_phandle(cpu_node, "cpu-idle-states", index);

}

1.2.3 idle_state_valid - 检验在idx级的idle状态是否一致

该函数的功能是：校验cpumask中的cpu，在第idx层级的idle状态对应的设备树节点是不是state_node，也就是说cpumask中所有cpu在第idx层级的idle状态应该是一样的

* Check that the idle state is uniform across all CPUs in the CPUidle driver

* cpumask

static bool idle_state_valid(

struct device_node *state_node, //idle等级对应的设备树节点

unsigned int idx, //idx表示idle的等级

const cpumask_t *cpumask)

{

int cpu;

struct device_node *cpu_node, *curr_state_node;

bool valid = true;

* Compare idle state phandles for index idx on all CPUs in the

* CPUidle driver cpumask. Start from next logical cpu following

* cpumask_first(cpumask) since that's the CPU state_node was

* retrieved from. If a mismatch is found bail out straight

* away since we certainly hit a firmware misconfiguration.

//1.遍历cpumask中的所有cpu

for (cpu = cpumask_next(cpumask_first(cpumask), cpumask);

cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpumask)) {

//2.通过下面两步，找到这个cpu的第idx对应的idle等级对应的设备树节点

cpu_node = of_cpu_device_node_get(cpu);

curr_state_node = of_get_cpu_state_node(cpu_node, idx);

//3.如果这两个值不一样，则说明这个cpu的第idx级idle状态，并不是state_node

if (state_node != curr_state_node)

valid = false;

of_node_put(curr_state_node);

of_node_put(cpu_node);

if (!valid)

break;

}

return valid;

}

1.2.4 init_state_node - 通过设备树解析一个C state

static int init_state_node(

struct cpuidle_state *idle_state, //解析后存放在哪

const struct of_device_id *match_id, //match表

struct device_node *state_node) //要解析哪一级的C state

{

int err;

const char *desc;

* CPUidle drivers are expected to initialize the const void *data

* pointer of the passed in struct of_device_id array to the idle

* state enter function.

//1.进入该C state的回调函数

idle_state->enter = match_id->data;

* Since this is not a "coupled" state, it's safe to assume interrupts

* won't be enabled when it exits allowing the tick to be frozen

* safely. So enter() can be also enter_s2idle() callback.

idle_state->enter_s2idle = match_id->data;

//2.从设备树中解析从该C state退出时的延迟

// 如果存在"wakeup-latency-us"，of_property_read_u32函数返回0，

// 不存在该属性时，进入if分支，通过其他属性解析

err = of_property_read_u32(state_node, "wakeup-latency-us", &idle_state->exit_latency);

if (err) {

u32 entry_latency, exit_latency;

//2.1 依次读出"entry-latency-us"和"exit-latency-us"属性

err = of_property_read_u32(state_node, "entry-latency-us", &entry_latency);

if (err) {

pr_debug(" * %pOF missing entry-latency-us property\n", state_node);

return -EINVAL;

}

err = of_property_read_u32(state_node, "exit-latency-us", &exit_latency);

if (err) {

pr_debug(" * %pOF missing exit-latency-us property\n", state_node);

return -EINVAL;

}

* If wakeup-latency-us is missing, default to entry+exit

* latencies as defined in idle states bindings

//2.2 两者的和表示从该级C state退出的耗时

idle_state->exit_latency = entry_latency + exit_latency;

}

//3.解析"min-residency-us"，该属性表示该C state的"最小滞留时间"

err = of_property_read_u32(state_node, "min-residency-us",

&idle_state->target_residency);

if (err) {

pr_debug(" * %pOF missing min-residency-us property\n", state_node);

return -EINVAL;

}

//4.解析这个C state的名称，若没有指定该属性，则直接使用C state的名称

err = of_property_read_string(state_node, "idle-state-name", &desc);

if (err)

desc = state_node->name;

//5.从设备树中解析flags信息，表示进入该C state是否需要关闭时钟

idle_state->flags = 0;

if (of_property_read_bool(state_node, "local-timer-stop"))

idle_state->flags |= CPUIDLE_FLAG_TIMER_STOP;

* TODO:

* replace with kstrdup and pointer assignment when name

* and desc become string pointers

//6.解析该C state的name和desc信息

strncpy(idle_state->name, state_node->name, CPUIDLE_NAME_LEN - 1);

strncpy(idle_state->desc, desc, CPUIDLE_DESC_LEN - 1);

return 0;

}

二、示例一：cpuidle-arm - ARM32平台

使能CONFIG_ARM_CPUIDLE宏会启用该cpuidle-arm，虽然arm32和arm64都有可能会启用该宏，但是实际上在Linux-5.10.61开源内核中，只有arm32中的omap2的pm33xx平台真正的把cpuidle-arm给用起来了。

这是因为cpuidle-arm在初始化的过程中，执行到arm_cpuidle_init的时候，会获取一个ops：在arm32平台中，只有omap2的pm33xx平台会真正注册这个ops；而在arm64平台中，获取到的ops没有设置cpu_suspend和cpu_init_idle回调，所以arm_cpuidle_init会直接返回-EOPNOTSUPP。因此除了arm32的omap2的pm33xx平台之外，cpuidle-arm在初始化过程中都不会注册cpuidle_driver驱动，也就没有真正生效

那么arm32和arm64上怎样进入idle呢？实际上：

arm32中，只有normal和idle两个状态，idle也没有等级之分，所以只需要wfi进入idle即可
arm64中，基本都是使用cpuidle-psci驱动，关于psci我们在后面介绍

PS：我在手机上做了一个实验，在arm_idle_init函数刚进来的时候直接return掉，手机的各项功能也是正常的，可以正常进入退出idle状态，这也证明了在arm64平台上，cpuidle-arm这个驱动根本就没有起作用

下面我们主要以arm32的omap2的pm33xx平台为例，分析一下cpuidle-arm驱动

文件位置：W:\opensource\linux-5.10.61\drivers\cpuidle\cpuidle-arm.c

2.1 arm32中的cpuidle_ops结构组织关系

在arm32平台中，定义了cpuidle_ops数据结构，由该数据结构定义的位置也可以知道，该数据结构是只为arm32提供的，该数据结构定义了两个方法：

suspend: 指定这个cpu如何进入指定级别的C state
init: 初始化指定的cpu

W:\opensource\linux-5.10.61\arch\arm\include\asm\cpuidle.h

/* Per-CPU idle callbacks used by the arm32 cpuidle glue code. */
struct cpuidle_ops {

int (*suspend)(unsigned long arg); // enters the low-power state selected by arg on the calling CPU

int (*init)(struct device_node *, int cpu); // one-time setup for the given CPU, passed its DT node

};

2.1.1 全局数组cpuidle_ops[NR_CPUS]

每个cpu定义一个cpuidle_ops类型的变量，如下

W:\opensource\linux-5.10.61\arch\arm\kernel\cpuidle.c

static struct cpuidle_ops cpuidle_ops[NR_CPUS] __ro_after_init;

全局的cpuidle_ops[]数组在哪设置？怎样获取指定cpu对应的cpuidle_ops结构？

2.1.2 填充__cpuidle_method_of_table段

下面的CPUIDLE_METHOD_OF_DECLARE完成的工作是将这个amx3_cpuidle_ops放入一个名为__cpuidle_method_of_table的段中，然后在需要的时候再从这个段中将这些ops读出来，并赋值给全局的cpuidle_ops[]数组。这是Linux的惯用手法，下面简单看一下原理

U:\linux-5.10.61\arch\arm\mach-omap2\pm33xx-core.c

static struct cpuidle_ops amx3_cpuidle_ops __initdata = {

.init = amx3_idle_init,

.suspend = amx3_idle_enter,

};

CPUIDLE_METHOD_OF_DECLARE(pm33xx_idle, "ti,am3352", &amx3_cpuidle_ops);

CPUIDLE_METHOD_OF_DECLARE(pm43xx_idle, "ti,am4372", &amx3_cpuidle_ops);

第一步：往__cpuidle_method_of_table段里面塞ops，CPUIDLE_METHOD_OF_DECLARE实现如下

#define CPUIDLE_METHOD_OF_DECLARE(name, _method, _ops) \

static const struct of_cpuidle_method __cpuidle_method_of_table_##name \

__cpuidle_method_section = { .method = _method, .ops = _ops }

其中：

#define __cpuidle_method_section __used __section("__cpuidle_method_of_table")

第二步：在链接器脚本中标记这个段的起始和结束

T:\arch\arm64\kernel\vmlinux.lds

...

. = ALIGN(8);

__cpuidle_method_of_table = .;

KEEP(*(__cpuidle_method_of_table))

KEEP(*(__cpuidle_method_of_table_end))

...

第三步：在C中将这个段的起始和结束声明为外部变量，这样在C中就能正大光明的使用了

U:\linux-5.10.61\arch\arm\kernel\cpuidle.c

extern struct of_cpuidle_method __cpuidle_method_of_table[];

第四步：使用__cpuidle_method_of_table这个变量，获取cpuidle_ops

/**

* arm_cpuidle_get_ops() - find a registered cpuidle_ops by name

* @method: the method name

* Search in the __cpuidle_method_of_table array the cpuidle ops matching the

* method name.

* Returns a struct cpuidle_ops pointer, NULL if not found.

static const struct cpuidle_ops *__init arm_cpuidle_get_ops(const char *method)

{

struct of_cpuidle_method *m = __cpuidle_method_of_table;

//遍历段中的所有cpuidle_ops，直到找到method指定的那个cpuidle_ops

for (; m->method; m++)

if (!strcmp(m->method, method))

return m->ops;

return NULL;

}

2.1.3 arm_cpuidle_get_ops - 根据字符串，从段中找到对应的cpuidle_ops

该函数在上一节已经介绍过，不再赘述，调用关系如下：

2.1.4 arm_cpuidle_read_ops - 获取cpu对应的cpuidle_ops，并赋值给全局数组变量cpuidle_ops[]

文件位置：W:\opensource\linux-5.10.61\arch\arm\kernel\cpuidle.c

/**

* arm_cpuidle_read_ops() - Initialize the cpuidle ops with the device tree

* @dn: a pointer to a struct device node corresponding to a cpu node

* @cpu: the cpu identifier

* Get the method name defined in the 'enable-method' property, retrieve the

* associated cpuidle_ops and do a struct copy. This copy is needed because all

* cpuidle_ops are tagged __initconst and will be unloaded after the init

* process.

* Return 0 on sucess, -ENOENT if no 'enable-method' is defined, -EOPNOTSUPP if

* no cpuidle_ops is registered for the 'enable-method', or if either init or

* suspend callback isn't defined.

static int __init arm_cpuidle_read_ops(

struct device_node *dn, //这个cpu对于的设备树节点

int cpu)

{

const char *enable_method;

const struct cpuidle_ops *ops;

//1.先从设备树中获取enable-method属性，以便后面从段中取出对于的cpuidle_ops结构

enable_method = of_get_property(dn, "enable-method", NULL);

if (!enable_method)

return -ENOENT;

//2.根据上面获取到的enable-method属性，从段中取出对于的cpuidle_ops结构

// 实际上大部分ARM32平台都在这里返回了-EOPNOTSUPP，目前只有omap的一个

// 平台设置了这个ops，所以大部分的arm32平台因为找不到ops导致cpuidle-arm初始化失败

ops = arm_cpuidle_get_ops(enable_method);

if (!ops) {

pr_warn("%pOF: unsupported enable-method property: %s\n",

dn, enable_method);

return -EOPNOTSUPP;

}

//3.在ARM32平台中，要求cpuidle_ops必须设置了这两个回调函数

if (!ops->init || !ops->suspend) {

pr_warn("cpuidle_ops '%s': no init or suspend callback\n",

enable_method);

return -EOPNOTSUPP;

}

//4.赋值给全局变量，以便后期使用

cpuidle_ops[cpu] = *ops; /* structure copy */

pr_notice("cpuidle: enable-method property '%s'"

" found operations\n", enable_method);

return 0;

}

2.2 arm_idle_init - cpuidle_arm驱动初始化流程

cpuidle_arm驱动初始化的时候调用该函数，实现为所有cpu初始化cpuidle_driver

* arm_idle_init - Initializes arm cpuidle driver

* Initializes arm cpuidle driver for all CPUs, if any CPU fails

* to register cpuidle driver then rollback to cancel all CPUs

* registeration.

static int __init arm_idle_init(void)

{

int cpu, ret;

struct cpuidle_driver *drv;

struct cpuidle_device *dev;

//遍历系统中每一个可能的cpu

for_each_possible_cpu(cpu) {

ret = arm_idle_init_cpu(cpu);

if (ret)

goto out_fail;

}

return 0;

out_fail:

while (--cpu >= 0) {

dev = per_cpu(cpuidle_devices, cpu);

drv = cpuidle_get_cpu_driver(dev);

cpuidle_unregister(drv);

kfree(drv);

}

return ret;

}

device_initcall(arm_idle_init); //cpuidle_arm驱动初始化的时候调用

2.3 arm_idle_init_cpu - 为指定的cpu初始化cpuidle_driver

U:\linux-5.10.61\drivers\cpuidle\cpuidle-arm.c

* arm_idle_init_cpu

* Registers the arm specific cpuidle driver with the cpuidle

* framework. It relies on core code to parse the idle states

* and initialize them using driver data structures accordingly.

static int __init arm_idle_init_cpu(int cpu)

{

int ret;

struct cpuidle_driver *drv;

//1.因为每个cpu都有自己的driver，首先copy一份模版

drv = kmemdup(&arm_idle_driver, sizeof(*drv), GFP_KERNEL);

if (!drv)

return -ENOMEM;

//2.这个mask中仅包含一个cpu，可见每个cpu对应一个driver

drv->cpumask = (struct cpumask *)cpumask_of(cpu);

* Initialize idle states data, starting at index 1. This

* driver is DT only, if no DT idle states are detected (ret

* == 0) let the driver initialization fail accordingly since

* there is no reason to initialize the idle driver if only

* wfi is supported.

//3.从设备树中提取C state信息，state1及其之后的C state必须从设备树中传进来

// 注意，下面传入的参数为1，表示从设备树中获取除了state0之外的所有state，

// state0默认为wfi state，具体可参见下面静态定义的arm_idle_driver

ret = dt_init_idle_driver(drv, arm_idle_state_match, 1);

if (ret <= 0) {

ret = ret ? : -ENODEV;

goto out_kfree_drv;

}

* Call arch CPU operations in order to initialize

* idle states suspend back-end specific data

//4.给个机会给平台端，让平台在设备树中读取自定义的C state信息

// 注意：对于ARM64平台，该函数返回-EOPNOTSUPP

ret = arm_cpuidle_init(cpu);

* Allow the initialization to continue for other CPUs, if the

* reported failure is a HW misconfiguration/breakage (-ENXIO).

* Some platforms do not support idle operations

* (arm_cpuidle_init() returning -EOPNOTSUPP), we should

* not flag this case as an error, it is a valid

* configuration.

//5.注意：对于ARM64在这里就返回了，没有执行下面的驱动注册，

// ARM64平台真正的注册的地方是在psci驱动初始化的地方，具

// 体参见psci_idle_init_cpu

if (ret) {

if (ret != -EOPNOTSUPP)

pr_err("CPU %d failed to init idle CPU ops\n", cpu);

ret = ret == -ENXIO ? 0 : ret;

goto out_kfree_drv;

}

//6.只有ARM32平台才会走到这里，ARM64在上面退出了

// 注册这个driver

ret = cpuidle_register(drv, NULL);

if (ret)

goto out_kfree_drv;

//7.暂不分析

cpuidle_cooling_register(drv);

return 0;

out_kfree_drv:

kfree(drv);

return ret;

}

2.3.1 arm_idle_driver - cpuidle_driver模版

由下面的注释可知：所有的ARM平台，都应提供默认的WFI standby状态，作为idle state 0，如果有例外，则需要在DTS中另行处理；

注意到：对于state0，其exit latency和target residency均为1（最小值），power usage为整数中的最大值。由此可以看出，这些信息不是实际信息（因为driver不可能知道所有ARM平台的WFI相关的信息），而是相对信息，其中的含义是：所有其它的state的exit latency和target residency都会比state0大，power usage都会比state0小

/* Template copied for every CPU by arm_idle_init_cpu(); only state 0 is preset. */
static struct cpuidle_driver arm_idle_driver __initdata = {
	.name = "arm_idle",
	.owner = THIS_MODULE,
	/*
	 * State at index 0 is standby wfi and considered standard
	 * on all ARM platforms. If in some platforms simple wfi
	 * can't be used as "state 0", DT bindings must be implemented
	 * to work around this issue and allow installing a special
	 * handler for idle state index 0.
	 */
	.states[0] = {
		.enter			= arm_enter_idle_state,	/* entry hook */
		.exit_latency		= 1,			/* smallest possible exit latency */
		.target_residency	= 1,			/* smallest possible residency */
		.power_usage		= UINT_MAX,		/* largest possible power figure */
		.name			= "WFI",
		.desc			= "ARM WFI",
	}
};

2.3.2 arm_idle_state_match - match表，用于从设备树中提取C state信息

/* Match table handed to dt_init_idle_driver() to recognise idle-state nodes. */
static const struct of_device_id arm_idle_state_match[] __initconst = {

{ .compatible = "arm,idle-state",

.data = arm_enter_idle_state }, // data carries the enter callback installed for every matched state

{ },

};

2.3.3 arm_enter_idle_state - 进入指定级别的C state的方法

arm_enter_idle_state使指定的cpu进入指定级别的C state，调用的函数如下，这两个函数我们在下一章单独讲解，其中idx表示C state的级别

当idx为0时，调用cpu_do_idle
当idx不为0时，调用arm_cpuidle_suspend

函数返回时表示已经从idle中退出，返回值表示本次实际进入的C state的索引（进入失败时返回-1）

实现位置：U:\linux-5.10.61\drivers\cpuidle\cpuidle-arm.c

* arm_enter_idle_state - Programs CPU to enter the specified state

* dev: cpuidle device

* drv: cpuidle driver

* idx: state index

* Called from the CPUidle framework to program the device to the

* specified target state selected by the governor.

static int arm_enter_idle_state(

struct cpuidle_device *dev, //要进入idle的cpu

struct cpuidle_driver *drv, //进入idle所使用的驱动

int idx) //要进入idle的级别

{

* Pass idle state index to arm_cpuidle_suspend which in turn

* will call the CPU ops suspend protocol with idle index as a

* parameter.

//arm_cpuidle_suspend方法的具体实现在下一章讲解

return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, idx);

}

2.3.4 CPU_PM_CPU_IDLE_ENTER

如果是要进入idle state0（即WFI），调用传统cpu_do_idle接口

对于其它的state，首先调用cpu_pm_enter，发出CPU即将进入low power state的通知，成功后调用指定的low_level_idle_enter接口，也就是arm_cpuidle_suspend（arm32）或psci_cpu_suspend_enter（arm64）接口，让cpu进入指定的idle状态，最后，从idle返回时，再次发送退出low power state的通知；

/*
 * __CPU_PM_CPU_IDLE_ENTER - common sequence for entering an idle state
 *
 * low_level_idle_enter: callback that performs the actual low-power entry
 * idx:                  index of the idle state to enter
 * state:                argument handed to low_level_idle_enter
 * is_retention:         when non-zero, cpu_pm_enter()/cpu_pm_exit() are skipped
 *
 * For idx == 0 the macro calls cpu_do_idle() and then executes
 * "return idx", i.e. it returns from the function that expands the macro.
 * Otherwise it evaluates to idx on success or -1 when cpu_pm_enter() or
 * the callback reported an error.
 *
 * Annotations are kept in this header because a comment placed after a
 * trailing backslash would break the line continuation.
 */
#define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter,			\
				idx,					\
				state,					\
				is_retention)				\
({									\
	int __ret = 0;							\
									\
	if (!idx) {							\
		cpu_do_idle();						\
		return idx;						\
	}								\
									\
	if (!is_retention)						\
		__ret = cpu_pm_enter();					\
	if (!__ret) {							\
		__ret = low_level_idle_enter(state);			\
		if (!is_retention)					\
			cpu_pm_exit();					\
	}								\
									\
	__ret ? -1 : idx;						\
})

/* Non-retention variant that passes the state index itself to the callback. */
#define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx)	\
	__CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 0)

2.4 arm_cpuidle_init - 读取平台在设备树中自定义的C state信息

2.4.1 arm64实现：直接返回-EOPNOTSUPP

arm64实现如下，因为在linux-5.10.61开源内核中，arm64获取到的ops没有实现cpu_suspend和cpu_init_idle接口（这个后面会分析），所以该函数直接返回-EOPNOTSUPP

文件位置：W:\opensource\linux-5.10.61\arch\arm64\kernel\cpuidle.c

/*
 * arm64 arm_cpuidle_init(): run the cpu_init_idle hook of this CPU's
 * cpu_operations when both cpu_suspend and cpu_init_idle are provided;
 * otherwise report -EOPNOTSUPP.
 */
int arm_cpuidle_init(unsigned int cpu)

{

const struct cpu_operations *ops = get_cpu_ops(cpu);

int ret = -EOPNOTSUPP;

// NOTE(review): per the article, on arm64 v5.10.61 the ops obtained

// above is cpu_psci_ops, which sets neither cpu_suspend nor

// cpu_init_idle, so the default -EOPNOTSUPP is returned -- confirm

if (ops && ops->cpu_suspend && ops->cpu_init_idle)

ret = ops->cpu_init_idle(cpu);

return ret;

}

2.4.2 arm32实现：大部分返回-EOPNOTSUPP，仅omap的一个平台有效

arm32实现如下，在获取ops的时候，只有omap2的pm33xx平台会真正的注册这个ops，所以在这里大部分平台都会返回-EOPNOTSUPP，只有omap2的pm33xx平台会继续走下去

文件位置：W:\opensource\linux-5.10.61\arch\arm\kernel\cpuidle.c

/**

* arm_cpuidle_init() - Initialize cpuidle_ops for a specific cpu

* @cpu: the cpu to be initialized

* Initialize the cpuidle ops with the device for the cpu and then call

* the cpu's idle initialization callback. This may fail if the underlying HW

* is not operational.

* Returns:

* 0 on success,

* -ENODEV if it fails to find the cpu node in the device tree,

* -EOPNOTSUPP if it does not find a registered and valid cpuidle_ops for

* this cpu,

* -ENOENT if it fails to find an 'enable-method' property,

* -ENXIO if the HW reports a failure or a misconfiguration,

* -ENOMEM if the HW report an memory allocation failure

int __init arm_cpuidle_init(int cpu)

{

//1.获取这个cpu对应的设备树节点

struct device_node *cpu_node = of_cpu_device_node_get(cpu);

int ret;

if (!cpu_node)

return -ENODEV;

//2.读出这个cpu对应的cpuidle_ops，并赋值给全局数组cpuidle_ops[]

// 如果平台端没有注册了ops的话，这里会返回-EOPNOTSUPP

ret = arm_cpuidle_read_ops(cpu_node, cpu);

//3.调用cpuidle_ops对应的init函数，完成平台自己关心的初始化工作

// 例如在omap2平台中，会在这个init中继续从设备树中获取自定义的

// C state信息，实际上omap2的这个平台就是通过这些信息，决定在执

// 行wfi命令之前需要关闭哪些外设，进而区分不同等级的C state

if (!ret)

ret = cpuidle_ops[cpu].init(cpu_node, cpu);

of_node_put(cpu_node);

return ret;

}

2.4.2.1 amx3_idle_init - omap2平台自定义C state信息

由前面分析可知，在omap2平台上，cpuidle_ops[cpu].init指定为amx3_idle_init，实现如下

W:\opensource\linux-5.10.61\arch\arm\mach-omap2\pm33xx-core.c

/*
 * amx3_idle_init - collect the platform-specific WFI flags of each idle state
 *
 * cpu_node: DT node of the CPU being initialised
 * cpu:      CPU number (not used by this function)
 *
 * Walks the "cpu-idle-states" phandle list of cpu_node and records, for
 * every available state, which extra actions must accompany WFI. Slot 0 is
 * never written (state_count starts at 1), so idle_states[0].wfi_flags
 * stays 0 as zero-initialised by kcalloc().
 *
 * Returns 0 on success, -ENOMEM if the idle_states table cannot be
 * allocated.
 *
 * NOTE: differs from v5.10.61 in two places: every state_node reference
 * is now released with of_node_put(), and the bound check uses
 * state_count (the index actually written) instead of i, which could let
 * states[CPUIDLE_STATE_MAX] be written.
 */
static int __init amx3_idle_init(struct device_node *cpu_node, int cpu)
{
	struct device_node *state_node;
	struct amx3_idle_state states[CPUIDLE_STATE_MAX];
	int i;
	int state_count = 1;

	for (i = 0; ; i++) {
		/* Next idle-state node referenced by this CPU; NULL ends the list. */
		state_node = of_parse_phandle(cpu_node, "cpu-idle-states", i);
		if (!state_node)
			break;

		if (!of_device_is_available(state_node)) {
			of_node_put(state_node);
			continue;
		}

		if (state_count == CPUIDLE_STATE_MAX) {
			pr_warn("%s: cpuidle states reached max possible\n",
				__func__);
			of_node_put(state_node);
			break;
		}

		states[state_count].wfi_flags = 0;

		/* Platform-specific property: wake the M3 and flush the cache. */
		if (of_property_read_bool(state_node, "ti,idle-wkup-m3"))
			states[state_count].wfi_flags |= WFI_FLAG_WAKE_M3 |
							 WFI_FLAG_FLUSH_CACHE;

		of_node_put(state_node);
		state_count++;
	}

	/* Global table later indexed by state in amx3_idle_enter(). */
	idle_states = kcalloc(state_count, sizeof(*idle_states), GFP_KERNEL);
	if (!idle_states)
		return -ENOMEM;

	for (i = 1; i < state_count; i++)
		idle_states[i].wfi_flags = states[i].wfi_flags;

	return 0;
}

omap2平台的设备树信息如下

cpus {

#address-cells = <1>;

#size-cells = <0>;

cpu@0 {

compatible = "arm,cortex-a8";

enable-method = "ti,am3352";

device_type = "cpu";

reg = <0>;

operating-points-v2 = <&cpu0_opp_table>;

clocks = <&dpll_mpu_ck>;

clock-names = "cpu";

clock-latency = <300000>; /* From omap-cpufreq driver */

cpu-idle-states = <&mpu_gate>; //只支持一个C state

};

idle-states {

mpu_gate: mpu_gate {

compatible = "arm,idle-state";

entry-latency-us = <40>;

exit-latency-us = <90>;

min-residency-us = <300>;

ti,idle-wkup-m3;

};

2.5 进入指定级别的C state

由上面对arm_enter_idle_state函数的分析可知，在cpuidle_arm中，cpu进入不同等级的C state对应的接口为

当idx为0时，调用cpu_do_idle
当idx不为0时，调用arm_cpuidle_suspend

其中：idx表示C state的级别

2.5.1 cpu_do_idle - 执行wfi指令进入idle 0

arm32平台实现如下：

函数位置：W:\opensource\linux-5.10.61\arch\arm\include\asm\glue-proc.h

#ifdef CONFIG_CPU_V7M

# ifdef CPU_NAME

# undef MULTI_CPU

# define MULTI_CPU

# else

# define CPU_NAME cpu_v7m

# endif

#endif

#define cpu_do_idle __glue(CPU_NAME,_do_idle)

以cpu_v7m为例，此时cpu_do_idle为cpu_v7m_do_idle

W:\opensource\linux-5.10.61\arch\arm\mm\proc-v7m.S

/*
 *	cpu_v7m_do_idle()
 *
 *	Idle the processor (eg, wait for interrupt).
 *
 *	IRQs are already disabled.
 */
ENTRY(cpu_v7m_do_idle)
	wfi			@ wait for interrupt: this is C state 0
	ret	lr		@ return to the caller once woken up
ENDPROC(cpu_v7m_do_idle)

2.5.2 arm_cpuidle_suspend - 进入指定级别的idle

arm_cpuidle_suspend直接调用操作函数集中的cpu_suspend回调函数，arm32和arm64实现还有点不一样，下面分情况介绍

2.5.2.1 ARM64

由前面的分析可知，任何arm64平台都不会通过下面接口进入idle，因为ops中就没有对cpu_suspend接口的定义

文件位置：W:\opensource\linux-5.10.61\arch\arm64\kernel\cpuidle.c

/**

* arm_cpuidle_suspend() - function to enter a low-power idle state

* @arg: argument to pass to CPU suspend operations

* Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU

* operations back-end error code otherwise.

int arm_cpuidle_suspend(int index)

{

int cpu = smp_processor_id();

const struct cpu_operations *ops = get_cpu_ops(cpu);

//进入指定的睡眠等级

return ops->cpu_suspend(index);

}

2.5.2.2 ARM32

由上面的分析可知，只有omap2中的pm33xx平台会走该接口进入idle

疑问：其他arm32怎么进入不同等级的idle呢？难道arm32就是不支持多等级的idle吗？？？

W:\opensource\linux-5.10.61\arch\arm\kernel\cpuidle.c

/**

* arm_cpuidle_suspend() - function to enter low power idle states

* @index: an integer used as an identifier for the low level PM callbacks

* This function calls the underlying arch specific low level PM code as

* registered at the init time.

* Returns the result of the suspend callback.

int arm_cpuidle_suspend(int index)

{

int cpu = smp_processor_id();

return cpuidle_ops[cpu].suspend(index);

}

如下，下面我们来分析一下amx3_idle_enter，看看arm32到底是怎样进入不同的idle等级的

W:\opensource\linux-5.10.61\arch\arm\mach-omap2\pm33xx-core.c

static struct cpuidle_ops amx3_cpuidle_ops __initdata = {

.init = amx3_idle_init,

.suspend = amx3_idle_enter,

};

CPUIDLE_METHOD_OF_DECLARE(pm33xx_idle, "ti,am3352", &amx3_cpuidle_ops);

CPUIDLE_METHOD_OF_DECLARE(pm43xx_idle, "ti,am4372", &amx3_cpuidle_ops);

2.5.2.2.1 amx3_idle_enter - 进入指定级别的C state

下面属于扩展内容，我们来看一下在这个平台上，所谓的"不同等级的C state"究竟是个啥玩意

/*
 * amx3_idle_enter() - enter the idle state selected by @index
 * @index: index of the requested state in idle_states[]
 *
 * Looks up the wfi_flags of the requested state and passes them to idle_fn
 * when idle_fn is set.
 *
 * Returns -EINVAL when the idle state table is missing, 0 otherwise (the
 * return value of idle_fn is not propagated).
 */
static int amx3_idle_enter(unsigned long index)
{
	struct amx3_idle_state *idle_state;

	/*
	 * Test the table itself. With a NULL table, &idle_states[index] is
	 * NULL only for index 0, so checking the element address did not
	 * protect the other indexes.
	 * NOTE(review): assumes idle_states is a pointer to the table - confirm.
	 */
	if (!idle_states)
		return -EINVAL;

	idle_state = &idle_states[index];

	if (idle_fn)
		idle_fn(idle_state->wfi_flags);

	return 0;
}

上面的idle_fn在哪被初始化？？？

2.5.2.2.2 am33xx_pm_probe

如下，实际是在pm33xx驱动初始化时被设置

文件位置：U:\linux-5.10.61\drivers\soc\ti\pm33xx.c

static int am33xx_pm_probe(struct platform_device *pdev)

{

struct device *dev = &pdev->dev;

int ret;

if (!of_machine_is_compatible("ti,am33xx") &&

!of_machine_is_compatible("ti,am43"))

return -ENODEV;

pm_ops = dev->platform_data;

...

//1.获取可执行代码放在哪里

pm_sram = pm_ops->get_sram_addrs();

...

//2.拷贝回调函数对应的二进制代码段

ret = am33xx_push_sram_idle();

...

//3.初始化

ret = pm_ops->init(am33xx_do_sram_idle);

if (ret) {

dev_err(dev, "Unable to call core pm init!\n");

ret = -ENODEV;

goto err_put_wkup_m3_ipc;

}

return 0;

}

2.5.2.2.3 am33xx_ops - 全局的pm_ops指向该结构

上面的pm_ops指向下面的ops，在am33xx_suspend_init中对上面的idle_fn进行了设置，最终idle_fn会被设置为am33xx_do_sram_idle

static struct am33xx_pm_platform_data am33xx_ops = {

.init = am33xx_suspend_init,

.deinit = amx3_suspend_deinit,

.soc_suspend = am33xx_suspend,

.cpu_suspend = am33xx_cpu_suspend,

.begin_suspend = amx3_begin_suspend,

.finish_suspend = amx3_finish_suspend,

.get_sram_addrs = amx3_get_sram_addrs,

.save_context = am33xx_save_context,

.restore_context = am33xx_restore_context,

.check_off_mode_enable = am33xx_check_off_mode_enable,

};

2.5.2.2.4 am33xx_do_sram_idle - 全局的idle_fn，进入不同等级的C state

/*
 * am33xx_do_sram_idle() - idle entry taking the per-state WFI flags
 * @wfi_flags: WFI_FLAG_* bits of the state to enter
 *
 * Returns 0 when m3_ipc or pm_ops is not set, otherwise the result of
 * pm_ops->cpu_suspend().
 */
static int am33xx_do_sram_idle(u32 wfi_flags)
{
	int ret = 0;

	if (!m3_ipc || !pm_ops)
		return 0;

	if (wfi_flags & WFI_FLAG_WAKE_M3)
		/* NOTE(review): ret is assigned here but never read afterwards. */
		ret = m3_ipc->ops->prepare_low_power(m3_ipc, WKUP_M3_IDLE);

	return pm_ops->cpu_suspend(am33xx_do_wfi_sram, wfi_flags);
}

2.5.2.2.5 am33xx_cpu_suspend

static int am33xx_cpu_suspend(

int (*fn)(unsigned long), //进入指定C state时的回调函数

unsigned long args) //这里传入的是wfi_flags

{

int ret = 0;

if (omap_irq_pending() || need_resched())

return ret;

//1.cpu_suspend函数中实际是调用了传递的fn回调函数，完成进入指定等级的C state

// 注意：这里传入的参数args为wfi_flags，该参数用于指示在执行wfi指令之前需要

// 关闭哪些外设，用这个方法区分不同等级的C state

ret = cpu_suspend(args, fn);

return ret;

}

2.5.2.2.6 cpu_suspend - 调用指定的回调函数，进入指定的C state

arm32实现如下：

U:\linux-5.10.61\arch\arm\kernel\suspend.c

#ifdef CONFIG_MMU

int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))

{

struct mm_struct *mm = current->active_mm;

u32 __mpidr = cpu_logical_map(smp_processor_id());

int ret;

if (!idmap_pgd)

return -EINVAL;

* Function graph tracer state gets incosistent when the kernel

* calls functions that never return (aka suspend finishers) hence

* disable graph tracing during their execution.

pause_graph_tracing();

* Provide a temporary page table with an identity mapping for

* the MMU-enable code, required for resuming. On successful

* resume (indicated by a zero return code), we need to switch

* back to the correct page tables.

//调用__cpu_suspend函数进入idle睡眠

ret = __cpu_suspend(arg, fn, __mpidr);

unpause_graph_tracing();

if (ret == 0) {

cpu_switch_mm(mm->pgd, mm);

local_flush_bp_all();

local_flush_tlb_all();

check_other_bugs();

}

return ret;

}

#else

int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))

{

u32 __mpidr = cpu_logical_map(smp_processor_id());

int ret;

pause_graph_tracing();

//调用__cpu_suspend函数进入idle睡眠

ret = __cpu_suspend(arg, fn, __mpidr);

unpause_graph_tracing();

return ret;

}

#define idmap_pgd NULL

#endif

2.5.2.2.7 __cpu_suspend

函数位置：U:\linux-5.10.61\arch\arm\kernel\sleep.S

* Save CPU state for a suspend. This saves the CPU general purpose

* registers, and allocates space on the kernel stack to save the CPU

* specific registers and some other data for resume.

* r0 = suspend function arg0

* r1 = suspend function

* r2 = MPIDR value the resuming CPU will use

ENTRY(__cpu_suspend)

stmfd sp!, {r4 - r11, lr}

#ifdef MULTI_CPU

ldr r10, =processor

ldr r4, [r10, #CPU_SLEEP_SIZE] @ size of CPU sleep state

#else

ldr r4, =cpu_suspend_size

#endif

mov r5, sp @ current virtual SP

add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn

sub sp, sp, r4 @ allocate CPU state on stack

ldr r3, =sleep_save_sp

stmfd sp!, {r0, r1} @ save suspend func arg and pointer

ldr r3, [r3, #SLEEP_SAVE_SP_VIRT]

ALT_SMP(ldr r0, =mpidr_hash)

ALT_UP_B(1f)

/* This ldmia relies on the memory layout of the mpidr_hash struct */

ldmia r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts

compute_mpidr_hash r0, r6, r7, r8, r2, r1

add r3, r3, r0, lsl #2

1: mov r2, r5 @ virtual SP

mov r1, r4 @ size of save block

add r0, sp, #8 @ pointer to save block

bl __cpu_suspend_save

badr lr, cpu_suspend_abort

ldmfd sp!, {r0, pc} @ call suspend fn

ENDPROC(__cpu_suspend)

2.5.2.2.8 am33xx_do_wfi_sram - 进入指定级别的C state

am33xx_do_wfi_sram完成真正的进入指定级别的C state，但是这是一个全局函数指针

U:\linux-5.10.61\drivers\soc\ti\pm33xx.c

static int (*am33xx_do_wfi_sram)(unsigned long unused);

在am33xx_push_sram_idle中完成对全局函数指针am33xx_do_wfi_sram的赋值，调用关系如下：

am33xx_pm_probe -> am33xx_push_sram_idle

static int am33xx_push_sram_idle(void)

{

...

am33xx_do_wfi_sram = sram_exec_copy(sram_pool, (void *)ocmcram_location,

pm_sram->do_wfi,

*pm_sram->do_wfi_sz);

if (!am33xx_do_wfi_sram) {

dev_err(pm33xx_dev,

"PM: %s: am33xx_do_wfi copy to sram failed\n",

__func__);

return -ENODEV;

}

...

}

sram_exec_copy函数声明如下，我们知道是从pm_sram->do_wfi向ocmcram_location拷贝可执行代码

void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src, size_t size);

2.5.2.2.9 amx3_get_sram_addrs - 计算pm_sram，即可执行代码存放位置

由上面分析的probe函数可知，pm_sram是通过get_sram_addrs获得的

pm_sram = pm_ops->get_sram_addrs();

get_sram_addrs回调函数为amx3_get_sram_addrs，实现如下：

一看就是平台相关，挑一个分析，此处我们选择am33xx_pm_sram继续分析

static struct am33xx_pm_sram_addr *amx3_get_sram_addrs(void)

{

if (soc_is_am33xx())

return &am33xx_pm_sram;

else if (soc_is_am437x())

return &am43xx_pm_sram;

else

return NULL;

}

2.5.2.2.10 am33xx_pm_sram - 在汇编中描述一个am33xx_pm_sram_addr类型结构

am33xx_pm_sram定义如下

U:\linux-5.10.61\arch\arm\mach-omap2\pm.h

extern struct am33xx_pm_sram_addr am33xx_pm_sram;

实际上am33xx_pm_sram对应下面的汇编，即在汇编中定义了一个am33xx_pm_sram_addr类型的数据结构实例：每个.word存放一个符号的地址，其中只有第一项am33xx_do_wfi是函数入口地址，其余各项是数据的地址

U:\linux-5.10.61\arch\arm\mach-omap2\sleep33xx.S

ENTRY(am33xx_pm_sram)

.word am33xx_do_wfi

.word am33xx_do_wfi_sz

.word am33xx_resume_offset

.word am33xx_emif_sram_table

.word am33xx_pm_ro_sram_data

resume_addr:

.word cpu_resume - PAGE_OFFSET + 0x80000000

上面数据结构的类型就是am33xx_pm_sram_addr，所以上面在调用pm_sram->do_wfi进入指定级别的C state，实际就是调用am33xx_do_wfi

struct am33xx_pm_sram_addr {

void (*do_wfi)(void);

unsigned long *do_wfi_sz;

unsigned long *resume_offset;

unsigned long *emif_sram_table;

unsigned long *ro_sram_data;

unsigned long resume_address;

};

2.5.2.2.11 am33xx_do_wfi - 进入不同等级的C state，实际就是在wfi之前关闭片上外设、刷cache等操作

由上面分析可知，am33xx_do_wfi真正地实现了进入不同等级的C state，在omap2平台上，进入不同等级的C state实际就是根据传进来的wfi_flags参数，决定在执行wfi指令之前，要执行哪些操作，比如要关闭哪些片上外设、是否需要flush cache等操作

注意：这只是arm32的omap2这个平台对"不同等级的C state"的定义，不同的平台有不同的定义

可用的wfi_flags参数如下

* WFI Flags for sleep code control

* These flags allow PM code to exclude certain operations from happening

* in the low level ASM code found in sleep33xx.S and sleep43xx.S

* WFI_FLAG_FLUSH_CACHE: Flush the ARM caches and disable caching. Only

* needed when MPU will lose context.

* WFI_FLAG_SELF_REFRESH: Let EMIF place DDR memory into self-refresh and

* disable EMIF.

* WFI_FLAG_SAVE_EMIF: Save context of all EMIF registers and restore in

* resume path. Only needed if PER domain loses context

* and must also have WFI_FLAG_SELF_REFRESH set.

* WFI_FLAG_WAKE_M3: Disable MPU clock or clockdomain to cause wkup_m3 to

* execute when WFI instruction executes.

* WFI_FLAG_RTC_ONLY: Configure the RTC to enter RTC+DDR mode.

#define WFI_FLAG_FLUSH_CACHE BIT(0)

#define WFI_FLAG_SELF_REFRESH BIT(1)

#define WFI_FLAG_SAVE_EMIF BIT(2)

#define WFI_FLAG_WAKE_M3 BIT(3)

#define WFI_FLAG_RTC_ONLY BIT(4)

如下：

U:\linux-5.10.61\arch\arm\mach-omap2\sleep33xx.S

ENTRY(am33xx_do_wfi)

stmfd sp!, {r4 - r11, lr} @ save registers on stack

/* Save wfi_flags arg to data space */

//1.首先将wfi_flags参数保存在r4中

mov r4, r0

//2.下面的操作只是为了保存wfi_flags，因为后面调用的一些操作可能会破坏r4

//2.1 am33xx_pm_ro_sram_data是在汇编中预留的一段空间

adr r3, am33xx_pm_ro_sram_data

//2.2 从am33xx_pm_ro_sram_data中读出amx3_pm_sram_data_virt

// amx3_pm_sram_data_virt指向的是一个am33xx_pm_sram_data类型的结构

ldr r2, [r3, #AMX3_PM_RO_SRAM_DATA_VIRT_OFFSET]

//2.3 将上面传进来的wfi_flags存入am33xx_pm_sram_data->wfi_flags中去

str r4, [r2, #AMX3_PM_WFI_FLAGS_OFFSET]

/* Only flush cache is we know we are losing MPU context */

//3.下面根据wfi_flags，执行不同的操作

tst r4, #WFI_FLAG_FLUSH_CACHE

beq cache_skip_flush

* Flush all data from the L1 and L2 data cache before disabling

* SCTLR.C bit.

ldr r1, kernel_flush

blx r1

* Clear the SCTLR.C bit to prevent further data cache

* allocation. Clearing SCTLR.C would make all the data accesses

* strongly ordered and would not hit the cache.

mrc p15, 0, r0, c1, c0, 0

bic r0, r0, #(1 << 2) @ Disable the C bit

mcr p15, 0, r0, c1, c0, 0

isb

* Invalidate L1 and L2 data cache.

ldr r1, kernel_flush

blx r1

//4.下面的操作是恢复上面保存的wfi_flags参数

adr r3, am33xx_pm_ro_sram_data

ldr r2, [r3, #AMX3_PM_RO_SRAM_DATA_VIRT_OFFSET]

ldr r4, [r2, #AMX3_PM_WFI_FLAGS_OFFSET]

cache_skip_flush:

/* Check if we want self refresh */

//5.继续根据wfi_flags，执行相应的操作

tst r4, #WFI_FLAG_SELF_REFRESH

beq emif_skip_enter_sr

adr r9, am33xx_emif_sram_table

ldr r3, [r9, #EMIF_PM_ENTER_SR_OFFSET]

blx r3

emif_skip_enter_sr:

/* Only necessary if PER is losing context */

//6.继续根据wfi_flags，执行相应的操作

tst r4, #WFI_FLAG_SAVE_EMIF

beq emif_skip_save

ldr r3, [r9, #EMIF_PM_SAVE_CONTEXT_OFFSET]

blx r3

emif_skip_save:

/* Only can disable EMIF if we have entered self refresh */

//7.继续根据wfi_flags，执行相应的操作

tst r4, #WFI_FLAG_SELF_REFRESH

beq emif_skip_disable

/* Disable EMIF */

ldr r1, virt_emif_clkctrl

ldr r2, [r1]

bic r2, r2, #AM33XX_CM_CLKCTRL_MODULEMODE_DISABLE

str r2, [r1]

ldr r1, virt_emif_clkctrl

wait_emif_disable:

ldr r2, [r1]

mov r3, #AM33XX_CM_CLKCTRL_MODULESTATE_DISABLED

cmp r2, r3

bne wait_emif_disable

emif_skip_disable:

//8.继续根据wfi_flags，执行相应的操作

tst r4, #WFI_FLAG_WAKE_M3

beq wkup_m3_skip

* For the MPU WFI to be registered as an interrupt

* to WKUP_M3, MPU_CLKCTRL.MODULEMODE needs to be set

* to DISABLED

ldr r1, virt_mpu_clkctrl

ldr r2, [r1]

bic r2, r2, #AM33XX_CM_CLKCTRL_MODULEMODE_DISABLE

str r2, [r1]

wkup_m3_skip:

* Execute an ISB instruction to ensure that all of the

* CP15 register changes have been committed.

isb

* Execute a barrier instruction to ensure that all cache,

* TLB and branch predictor maintenance operations issued

* have completed.

dsb

dmb

* Execute a WFI instruction and wait until the

* STANDBYWFI output is asserted to indicate that the

* CPU is in idle and low power state. CPU can specualatively

* prefetch the instructions so add NOPs after WFI. Thirteen

* NOPs as per Cortex-A8 pipeline.

//9.执行wfi指令，进入idle

wfi

nop

/* We come here in case of an abort due to a late interrupt */

/* Set MPU_CLKCTRL.MODULEMODE back to ENABLE */

ldr r1, virt_mpu_clkctrl

mov r2, #AM33XX_CM_CLKCTRL_MODULEMODE_ENABLE

str r2, [r1]

/* Re-enable EMIF */

ldr r1, virt_emif_clkctrl

mov r2, #AM33XX_CM_CLKCTRL_MODULEMODE_ENABLE

str r2, [r1]

wait_emif_enable:

ldr r3, [r1]

cmp r2, r3

bne wait_emif_enable

/* Only necessary if PER is losing context */

//10.下面根据wfi_flags，恢复相应的操作

tst r4, #WFI_FLAG_SELF_REFRESH

beq emif_skip_exit_sr_abt

adr r9, am33xx_emif_sram_table

ldr r1, [r9, #EMIF_PM_ABORT_SR_OFFSET]

blx r1

emif_skip_exit_sr_abt:

//11.继续根据wfi_flags，恢复相应的操作

tst r4, #WFI_FLAG_FLUSH_CACHE

beq cache_skip_restore

* Set SCTLR.C bit to allow data cache allocation

mrc p15, 0, r0, c1, c0, 0

orr r0, r0, #(1 << 2) @ Enable the C bit

mcr p15, 0, r0, c1, c0, 0

isb

cache_skip_restore:

/* Let the suspend code know about the abort */

//12.走到这里说明wfi被（迟到的中断）打断，进入低功耗的过程被中止，返回1通知上层的suspend代码本次发生了abort

mov r0, #1

ldmfd sp!, {r4 - r11, pc} @ restore regs and return

ENDPROC(am33xx_do_wfi)

2.5.2.2.12 am33xx_pm_ro_sram_data全局变量在哪定义

在汇编中预留了一段struct am33xx_pm_ro_sram_data空间

U:\linux-5.10.61\arch\arm\mach-omap2\sleep33xx.S

.align 3

ENTRY(am33xx_pm_ro_sram_data)

.space AMX3_PM_RO_SRAM_DATA_SIZE

其中，AMX3_PM_RO_SRAM_DATA_SIZE定义如下，定义方法在下面讲解

U:\linux-5.10.61\arch\arm\mach-omap2\pm-asm-offsets.c

int main(void)

{

ti_emif_asm_offsets();

DEFINE(AMX3_PM_WFI_FLAGS_OFFSET,

offsetof(struct am33xx_pm_sram_data, wfi_flags));

DEFINE(AMX3_PM_L2_AUX_CTRL_VAL_OFFSET,

offsetof(struct am33xx_pm_sram_data, l2_aux_ctrl_val));

DEFINE(AMX3_PM_L2_PREFETCH_CTRL_VAL_OFFSET,

offsetof(struct am33xx_pm_sram_data, l2_prefetch_ctrl_val));

DEFINE(AMX3_PM_SRAM_DATA_SIZE, sizeof(struct am33xx_pm_sram_data));

BLANK();

DEFINE(AMX3_PM_RO_SRAM_DATA_VIRT_OFFSET,

offsetof(struct am33xx_pm_ro_sram_data, amx3_pm_sram_data_virt));

DEFINE(AMX3_PM_RO_SRAM_DATA_PHYS_OFFSET,

offsetof(struct am33xx_pm_ro_sram_data, amx3_pm_sram_data_phys));

DEFINE(AMX3_PM_RTC_BASE_VIRT_OFFSET,

offsetof(struct am33xx_pm_ro_sram_data, rtc_base_virt));

DEFINE(AMX3_PM_RO_SRAM_DATA_SIZE,

sizeof(struct am33xx_pm_ro_sram_data));

return 0;

}

其中am33xx_pm_ro_sram_data和am33xx_pm_sram_data数据结构定义如下：

struct am33xx_pm_ro_sram_data {

u32 amx3_pm_sram_data_virt;

u32 amx3_pm_sram_data_phys;

void __iomem *rtc_base_virt;

} __packed __aligned(8);

struct am33xx_pm_sram_data {

u32 wfi_flags;

u32 l2_aux_ctrl_val;

u32 l2_prefetch_ctrl_val;

} __packed __aligned(8);

扩展：

注意上面宏定义的方法：在c中定义的宏，在汇编中使用，实际上是利用了编译过程中生成的.s文件。由下面的makefile可知，pm-asm-offsets.h是由pm-asm-offsets.s经过filechk处理后生成的，sleep33xx.o和sleep43xx.o都依赖这个头文件。上面的main只是一个"载体函数"，这个函数永远不会被调用，但是其中的DEFINE对编译器是可见的

U:\linux-5.10.61\arch\arm\mach-omap2\Makefile

$(obj)/pm-asm-offsets.h: $(obj)/pm-asm-offsets.s FORCE

$(call filechk,offsets,__TI_PM_ASM_OFFSETS_H__)

$(obj)/sleep33xx.o $(obj)/sleep43xx.o: $(obj)/pm-asm-offsets.h

targets += pm-asm-offsets.s

clean-files += pm-asm-offsets.h

三、示例二：cpuidle-big_little - ARM32平台

使能了CONFIG_ARM_BIG_LITTLE_CPUIDLE该宏后启用该驱动

文件位置：U:\linux-5.10.61\drivers\cpuidle\cpuidle-big_little.c

3.1 大小核对应的cpuidle_driver

在cpuidle-big_little中，不管是大核还是小核，都只有两个C state，我们暂且称之为idle0和idle1

3.1.1 bl_idle_little_driver - 小核

* NB: Owing to current menu governor behaviour big and LITTLE

* index 1 states have to define exit_latency and target_residency for

* cluster state since, when all CPUs in a cluster hit it, the cluster

* can be shutdown. This means that when a single CPU enters this state

* the exit_latency and target_residency values are somewhat overkill.

* There is no notion of cluster states in the menu governor, so CPUs

* have to define CPU states where possibly the cluster will be shutdown

* depending on the state of other CPUs. idle states entry and exit happen

* at random times; however the cluster state provides target_residency

* values as if all CPUs in a cluster enter the state at once; this is

* somewhat optimistic and behaviour should be fixed either in the governor

* or in the MCPM back-ends.

* To make this driver 100% generic the number of states and the exit_latency

* target_residency values must be obtained from device tree bindings.

* exit_latency: refers to the TC2 vexpress test chip and depends on the

* current cluster operating point. It is the time it takes to get the CPU

* up and running when the CPU is powered up on cluster wake-up from shutdown.

* Current values for big and LITTLE clusters are provided for clusters

* running at default operating points.

* target_residency: it is the minimum amount of time the cluster has

* to be down to break even in terms of power consumption. cluster

* shutdown has inherent dynamic power costs (L2 writebacks to DRAM

* being the main factor) that depend on the current operating points.

* The current values for both clusters are provided for a CPU whose half

* of L2 lines are dirty and require cleaning to DRAM, and takes into

* account leakage static power values related to the vexpress TC2 testchip.

static struct cpuidle_driver bl_idle_little_driver = {

.name = "little_idle",

.owner = THIS_MODULE,

.states[0] = ARM_CPUIDLE_WFI_STATE, //进入idle0的方法

.states[1] = {

.enter = bl_enter_powerdown, //进入idle1的方法

.exit_latency = 700,

.target_residency = 2500,

.flags = CPUIDLE_FLAG_TIMER_STOP,

.name = "C1",

.desc = "ARM little-cluster power down",

.state_count = 2,

};

3.1.1.1 ARM_CPUIDLE_WFI_STATE - 执行wfi指令进入0级idle

/* Common ARM WFI state */

/*
 * Initializer for the WFI idle state; its .enter callback is
 * arm_cpuidle_simple_enter and @p becomes .power_usage.
 * The note on .enter lives here because nothing may follow a
 * line-continuation backslash inside the macro body.
 */
#define ARM_CPUIDLE_WFI_STATE_PWR(p) {\
	.enter = arm_cpuidle_simple_enter,\
	.exit_latency = 1,\
	.target_residency = 1,\
	.power_usage = p,\
	.name = "WFI",\
	.desc = "ARM WFI",\
}

* in case power_specified == 1, give a default WFI power value needed

* by some governors

#define ARM_CPUIDLE_WFI_STATE ARM_CPUIDLE_WFI_STATE_PWR(UINT_MAX)

3.1.2 bl_idle_big_driver - 大核

static struct cpuidle_driver bl_idle_big_driver = {

.name = "big_idle",

.owner = THIS_MODULE,

.states[0] = ARM_CPUIDLE_WFI_STATE, //进入idle0的方法

.states[1] = {

.enter = bl_enter_powerdown, //进入idle1的方法

.exit_latency = 500,

.target_residency = 2000,

.flags = CPUIDLE_FLAG_TIMER_STOP,

.name = "C1",

.desc = "ARM big-cluster power down",

.state_count = 2,

};

3.2 bl_idle_init - 驱动初始化

static int __init bl_idle_init(void)

{

int ret;

//1.找到设备树的根节点

struct device_node *root = of_find_node_by_path("/");

const struct of_device_id *match_id;

if (!root)

return -ENODEV;

* Initialize the driver just for a compliant set of machines

//2.只有match匹配上才允许使用bl模块

match_id = of_match_node(compatible_machine_match, root);

of_node_put(root);

if (!match_id)

return -ENODEV;

if (!mcpm_is_available())

return -EUNATCH;

* For now the differentiation between little and big cores

* is based on the part number. A7 cores are considered little

* cores, A15 are considered big cores. This distinction may

* evolve in the future with a more generic matching approach.

//3.初始化driver，设置driver->cpumask

// 小核A7，大核A15

ret = bl_idle_driver_init(&bl_idle_little_driver, ARM_CPU_PART_CORTEX_A7);

if (ret)

return ret;

ret = bl_idle_driver_init(&bl_idle_big_driver, ARM_CPU_PART_CORTEX_A15);

if (ret)

goto out_uninit_little;

/* Start at index 1, index 0 standard WFI */

//4.解析设备树中的C state信息

// 这里传入的参数为1，由上面注释可知，从设备树解析出来的

// C state信息是给idx>=1使用的，idx=0是wfi预留的

ret = dt_init_idle_driver(&bl_idle_big_driver, bl_idle_state_match, 1);

if (ret < 0)

goto out_uninit_big;

/* Start at index 1, index 0 standard WFI */

ret = dt_init_idle_driver(&bl_idle_little_driver, bl_idle_state_match, 1);

if (ret < 0)

goto out_uninit_big;

//5.注册driver

ret = cpuidle_register(&bl_idle_little_driver, NULL);

if (ret)

goto out_uninit_big;

ret = cpuidle_register(&bl_idle_big_driver, NULL);

if (ret)

goto out_unregister_little;

return 0;

out_unregister_little:

cpuidle_unregister(&bl_idle_little_driver);

out_uninit_big:

kfree(bl_idle_big_driver.cpumask);

out_uninit_little:

kfree(bl_idle_little_driver.cpumask);

return ret;

}

device_initcall(bl_idle_init);

3.2.1 compatible_machine_match - match表

static const struct of_device_id compatible_machine_match[] = {

{ .compatible = "arm,vexpress,v2p-ca15_a7" },

{ .compatible = "samsung,exynos5420" },

{ .compatible = "samsung,exynos5800" },

{},

};

3.2.2 bl_idle_state_match

static const struct of_device_id bl_idle_state_match[] __initconst = {

{ .compatible = "arm,idle-state",

.data = bl_enter_powerdown },

{ },

};

3.2.3 bl_idle_driver_init

static int __init bl_idle_driver_init(struct cpuidle_driver *drv, int part_id)

{

struct cpumask *cpumask;

int cpu;

cpumask = kzalloc(cpumask_size(), GFP_KERNEL);

if (!cpumask)

return -ENOMEM;

for_each_possible_cpu(cpu)

if (smp_cpuid_part(cpu) == part_id)

cpumask_set_cpu(cpu, cpumask);

drv->cpumask = cpumask;

return 0;

}

3.3 进入指定级别的C state

3.3.1 arm_cpuidle_simple_enter - 执行wfi进入idle0

/**
 * arm_cpuidle_simple_enter() - a wrapper to cpu_do_idle()
 * @dev: not used
 * @drv: not used
 * @index: not used by the idle entry itself, returned unchanged
 *
 * A trivial wrapper to allow the cpu_do_idle function to be assigned as a
 * cpuidle callback by matching the function signature.
 *
 * Returns the index passed as parameter
 */
int arm_cpuidle_simple_enter(struct cpuidle_device *dev,
			     struct cpuidle_driver *drv, int index)
{
	cpu_do_idle();

	return index;
}

3.3.1.1 cpu_do_idle - 不同架构有自己的idle方法

W:\opensource\linux-5.10.61\arch\arm\include\asm\glue-proc.h

#define cpu_do_idle __glue(CPU_NAME,_do_idle)

我们以cpu_v7m为例，则CPU_NAME定义如下：

#ifdef CONFIG_CPU_V7M

# ifdef CPU_NAME

# undef MULTI_CPU

# define MULTI_CPU

# else

# define CPU_NAME cpu_v7m

# endif

#endif

此时cpu_do_idle为cpu_v7m_do_idle

W:\opensource\linux-5.10.61\arch\arm\mm\proc-v7m.S

* cpu_v7m_do_idle()

* Idle the processor (eg, wait for interrupt).

* IRQs are already disabled.

ENTRY(cpu_v7m_do_idle)

wfi //直接执行wfi进入睡眠

ret lr //醒来后，返回原来的位置继续执行

ENDPROC(cpu_v7m_do_idle)

更多接口参见

3.3.2 bl_enter_powerdown -关闭cpu和cluster的电源，进入idle1

由上面driver的定义可知，在cpuidle-big_little中，不管是大核还是小核，都只有两个C state，我们暂且称之为idle0和idle1，该函数是进入idle1，因为固定为idle1，所以这里传入的idx并没有使用

/**

* bl_enter_powerdown - Programs CPU to enter the specified state

* @dev: cpuidle device

* @drv: The target state to be programmed

* @idx: state index

* Called from the CPUidle framework to program the device to the

* specified target state selected by the governor.

static int bl_enter_powerdown(

struct cpuidle_device *dev,

struct cpuidle_driver *drv,

int idx) //要进入的idle级别，但是在这个函数中并没有用

{

cpu_pm_enter();

//调用bl_powerdown_finisher函数进入指定的idle等级

//注意：下面传入的0在bl_powerdown_finisher也是没有使用的，因为固定进入idle1

cpu_suspend(0, bl_powerdown_finisher);

/* signals the MCPM core that CPU is out of low power state */

mcpm_cpu_powered_up();

cpu_pm_exit();

return idx;

}

3.3.2.1 bl_powerdown_finisher -关闭cpu和cluster的电源，进入idle1

* notrace prevents trace shims from getting inserted where they

* should not. Global jumps and ldrex/strex must not be inserted

* in power down sequences where caches and MMU may be turned off.

static int notrace bl_powerdown_finisher(

unsigned long arg) //传入的参数并没有使用

{

/* MCPM works with HW CPU identifiers */

unsigned int mpidr = read_cpuid_mpidr();

unsigned int cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);

unsigned int cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);

//1.设置cpu从idle退出后，从哪开始执行

mcpm_set_entry_vector(cpu, cluster, cpu_resume);

//2.进入idle1

mcpm_cpu_suspend();

/* return value != 0 means failure */

return 1;

}

3.3.2.2 mcpm_cpu_suspend

mcpm是Multi-Cluster PM的缩写

void mcpm_cpu_suspend(void)

{

if (WARN_ON_ONCE(!platform_ops))

return;

/* Some platforms might have to enable special resume modes, etc. */

//1.一些平台可能需要执行一些准备工作

if (platform_ops->cpu_suspend_prepare) {

unsigned int mpidr = read_cpuid_mpidr();

unsigned int cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);

unsigned int cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);

arch_spin_lock(&mcpm_lock);

platform_ops->cpu_suspend_prepare(cpu, cluster);

arch_spin_unlock(&mcpm_lock);

}

//2.进入idle

mcpm_cpu_power_down();

}

3.3.2.2.1 tc2_pm_cpu_suspend_prepare - 设置cpu唤醒时从哪取指执行

在vexpress平台中cpu_suspend_prepare回调函数实现如下。该函数将"cpu从idle退出时要执行的地址"写入一个寄存器，cpu被唤醒后，会从这个寄存器里面读取地址，并在指定的地址取指执行

U:\linux-5.10.61\arch\arm\mach-vexpress\tc2_pm.c

static void tc2_pm_cpu_suspend_prepare(unsigned int cpu, unsigned int cluster)

{

ve_spc_set_resume_addr(cluster, cpu, __pa_symbol(mcpm_entry_point));

}

3.3.2.2.2 ve_spc_set_resume_addr - 设置cpu唤醒时从哪取指执行

/**

* ve_spc_set_resume_addr() - set the jump address used for warm boot

* @cluster: mpidr[15:8] bitfield describing cluster affinity level

* @cpu: mpidr[7:0] bitfield describing cpu affinity level

* @addr: physical resume address

void ve_spc_set_resume_addr(u32 cluster, u32 cpu, u32 addr)

{

void __iomem *baseaddr;

if (cluster >= MAX_CLUSTERS)

return;

//不同的cpu架构，读取的寄存器不一样

if (cluster_is_a15(cluster))

baseaddr = info->baseaddr + A15_BX_ADDR0 + (cpu << 2);

else

baseaddr = info->baseaddr + A7_BX_ADDR0 + (cpu << 2);

//将要唤醒的地址存取寄存器

writel_relaxed(addr, baseaddr);

}

3.3.2.3 mcpm_cpu_power_down - 关闭cpu和cluster的电源，进入idle1

void mcpm_cpu_power_down(void)

{

unsigned int mpidr, cpu, cluster;

bool cpu_going_down, last_man;

phys_reset_t phys_reset;

//1.读取当前cpu的mpidr寄存器，并从mpidr中提取出cls和cpu的值

mpidr = read_cpuid_mpidr();

cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);

cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);

pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);

if (WARN_ON_ONCE(!platform_ops))

return;

BUG_ON(!irqs_disabled());

setup_mm_for_reboot();

__mcpm_cpu_going_down(cpu, cluster);

arch_spin_lock(&mcpm_lock);

BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP);

//2.mcpm_cpu_use_count用于记录这个cpu的状态

// a) 0: 表示这个cpu一定down，也就是处于掉电状态

// b) 1: 表示这个cpu处于up状态，也就是上电状态

mcpm_cpu_use_count[cluster][cpu]--;

BUG_ON(mcpm_cpu_use_count[cluster][cpu] != 0 && mcpm_cpu_use_count[cluster][cpu] != 1);

cpu_going_down = !mcpm_cpu_use_count[cluster][cpu];

//3.判断这个cluster中是不是只有一个cpu了

last_man = mcpm_cluster_unused(cluster);

//4.给cluster或者cpu下电

if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {

//4.1 如果当前这个cpu是这个cluster中最后一个cpu，这个cpu下电后不会

// 有其他cpu运行了，则需要同时把cpu和cluster的电源都关闭掉

platform_ops->cpu_powerdown_prepare(cpu, cluster);

platform_ops->cluster_powerdown_prepare(cluster);

arch_spin_unlock(&mcpm_lock);

platform_ops->cluster_cache_disable();

__mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);

} else {

//4.2 走入该分支，表示这个cpu下电后，cluster中还有其他

// cpu在运行，此时cluster不能下电

if (cpu_going_down)

platform_ops->cpu_powerdown_prepare(cpu, cluster);

arch_spin_unlock(&mcpm_lock);

* If cpu_going_down is false here, that means a power_up

* request raced ahead of us. Even if we do not want to

* shut this CPU down, the caller still expects execution

* to return through the system resume entry path, like

* when the WFI is aborted due to a new IRQ or the like..

* So let's continue with cache cleaning in all cases.

platform_ops->cpu_cache_disable();

}

__mcpm_cpu_down(cpu, cluster);

/* Now we are prepared for power-down, do it: */

//5.下面执行wfi指令进入睡眠状态

if (cpu_going_down)

wfi();

//6.走到这里表示已经从idle中退出来了

* It is possible for a power_up request to happen concurrently

* with a power_down request for the same CPU. In this case the

* CPU might not be able to actually enter a powered down state

* with the WFI instruction if the power_up request has removed

* the required reset condition. We must perform a re-entry in

* the kernel as if the power_up method just had deasserted reset

* on the CPU.

//7.调用cpu_reset汇编代码，完成这个cpu被唤醒后的准备工作

phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset);

phys_reset(__pa_symbol(mcpm_entry_point), false);

/* should never get here */

BUG();

}

3.3.2.4 cpu_reset - cpu被唤醒后要执行的工作

W:\opensource\linux-5.10.61\arch\arm\include\asm\glue-proc.h

#define cpu_reset __glue(CPU_NAME,_reset)

我们依然以cpu_v7m为例

文件位置：W:\opensource\linux-5.10.61\arch\arm\mm\proc-v7m.S

* cpu_v7m_reset(loc)

* Perform a soft reset of the system. Put the CPU into the

* same state as it would be if it had been reset, and branch

* to what would be the reset vector.

* - loc - location to jump to for soft reset

.align 5

ENTRY(cpu_v7m_reset)

ret r0 //直接跳转到r0所指向的地址执行

ENDPROC(cpu_v7m_reset)

3.3.2.5 mcpm_entry_point - cpu退出idle时从这开始执行

W:\opensource\linux-5.10.61\arch\arm\common\mcpm_head.S

ENTRY(mcpm_entry_point)

ARM_BE8(setend be)

THUMB( badr r12, 1f )

THUMB( bx r12 )

THUMB( .thumb )

//第一步：判断要唤醒的cpu的id是否合法

mrc p15, 0, r0, c0, c0, 5 @ MPIDR

ubfx r9, r0, #0, #8 @ r9 = cpu

ubfx r10, r0, #8, #8 @ r10 = cluster

mov r3, #MAX_CPUS_PER_CLUSTER

//mla指令完成的工作：r4 <- r3 * r10 +r9

mla r4, r3, r10, r9 @ r4 = canonical CPU index

cmp r4, #(MAX_CPUS_PER_CLUSTER * MAX_NR_CLUSTERS)

blo 2f

/* We didn't expect this CPU. Try to cheaply make it quiet. */

//正常情况下不会走到这里，只有当计算出的cpu编号超出范围（不是预期的cpu）时才会进来

1: wfi

wfe

b 1b

//走到这里表示这个cpu马上就要启动了

2: pr_dbg "kernel mcpm_entry_point\n"

* MMU is off so we need to get to various variables in a

* position independent way.

//由上面的注释可知，因为此时mmu是关闭的，我们在这里需要用一种方法获取变量的地址

//下面3f的位置实际就是一个文字池（literal pool），里面保存的是各个符号相对于该位置的偏移量

adr r5, 3f

ldmia r5, {r0, r6, r7, r8, r11}

add r0, r5, r0 @ r0 = mcpm_entry_early_pokes

add r6, r5, r6 @ r6 = mcpm_entry_vectors

ldr r7, [r5, r7] @ r7 = mcpm_power_up_setup_phys

add r8, r5, r8 @ r8 = mcpm_sync

add r11, r5, r11 @ r11 = first_man_locks

@ Perform an early poke, if any

add r0, r0, r4, lsl #3

ldmia r0, {r0, r1}

teq r0, #0

strne r1, [r0]

mov r0, #MCPM_SYNC_CLUSTER_SIZE

mla r8, r0, r10, r8 @ r8 = sync cluster base

@ Signal that this CPU is coming UP:

mov r0, #CPU_COMING_UP

mov r5, #MCPM_SYNC_CPU_SIZE

mla r5, r9, r5, r8 @ r5 = sync cpu address

strb r0, [r5]

@ At this point, the cluster cannot unexpectedly enter the GOING_DOWN

@ state, because there is at least one active CPU (this CPU).

mov r0, #VLOCK_SIZE

mla r11, r0, r10, r11 @ r11 = cluster first man lock

mov r0, r11

mov r1, r9 @ cpu

bl vlock_trylock @ implies DMB

cmp r0, #0 @ failed to get the lock?

bne mcpm_setup_wait @ wait for cluster setup if so

ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]

cmp r0, #CLUSTER_UP @ cluster already up?

bne mcpm_setup @ if not, set up the cluster

@ Otherwise, release the first man lock and skip setup:

mov r0, r11

bl vlock_unlock

b mcpm_setup_complete

mcpm_setup:

@ Control dependency implies strb not observable before previous ldrb.

@ Signal that the cluster is being brought up:

mov r0, #INBOUND_COMING_UP

strb r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND]

dmb

@ Any CPU trying to take the cluster into CLUSTER_GOING_DOWN from this

@ point onwards will observe INBOUND_COMING_UP and abort.

@ Wait for any previously-pending cluster teardown operations to abort

@ or complete:

mcpm_teardown_wait:

ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]

cmp r0, #CLUSTER_GOING_DOWN

bne first_man_setup

wfe

b mcpm_teardown_wait

first_man_setup:

dmb

@ If the outbound gave up before teardown started, skip cluster setup:

cmp r0, #CLUSTER_UP

beq mcpm_setup_leave

@ power_up_setup is now responsible for setting up the cluster:

cmp r7, #0

mov r0, #1 @ second (cluster) affinity level

blxne r7 @ Call power_up_setup if defined

dmb

mov r0, #CLUSTER_UP

strb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]

dmb

mcpm_setup_leave:

@ Leave the cluster setup critical section:

mov r0, #INBOUND_NOT_COMING_UP

strb r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND]

dsb st

sev

mov r0, r11

bl vlock_unlock @ implies DMB

b mcpm_setup_complete

@ In the contended case, non-first men wait here for cluster setup

@ to complete:

mcpm_setup_wait:

ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]

cmp r0, #CLUSTER_UP

wfene

bne mcpm_setup_wait

dmb

mcpm_setup_complete:

@ If a platform-specific CPU setup hook is needed, it is

@ called from here.

cmp r7, #0

mov r0, #0 @ first (CPU) affinity level

blxne r7 @ Call power_up_setup if defined

dmb

@ Mark the CPU as up:

mov r0, #CPU_UP

strb r0, [r5]

@ Observability order of CPU_UP and opening of the gate does not matter.

mcpm_entry_gated:

ldr r5, [r6, r4, lsl #2] @ r5 = CPU entry vector

cmp r5, #0

wfeeq

beq mcpm_entry_gated

dmb

pr_dbg "released\n"

bx r5

.align 2

//内存池，保存一些特点的变量或函数的地址

3: .word mcpm_entry_early_pokes - .

.word mcpm_entry_vectors - 3b

.word mcpm_power_up_setup_phys - 3b

.word mcpm_sync - 3b

.word first_man_locks - 3b

ENDPROC(mcpm_entry_point)

四、示例三：cpuidle-psci - ARM64平台

psci是Power State Coordination Interface的缩写，是ARM定义的一套电源管理接口规范，由固件实现；内核通过SMC Calling调用固件里面的函数接口（实际上函数调用就是执行smc指令，并通过r0/x0指定要调用的函数接口，通过r1~r3/x1~x3传递参数）

当使能了CONFIG_ARM_PSCI_CPUIDLE宏的时候会启用cpuidle-psci，arm64一般走这个分支

4.1 SMC Calling扫盲

所谓的SMC Calling可理解为调用固件里面的一个函数，实际上就是执行smc指令，并通过r0指定要调用的函数接口，通过r1~r3传递参数，关于SMC Calling的更多信息，可以参见下面文档

下面我们以CPU_SUSPEND系列调用为例，介绍SMC Calling

4.1.1 函数调用时的参数和返回值的传递

对于SMC Calling的参数和返回值，参见《DEN0028E_SMC_Calling_Convention-1_4alp0.pdf》的2.6和2.7节，对于smc64：

要调用的function id通过x0传递
调用上面的function需要传递一些参数，参数通过x1~x17传递
上面函数调用完成后需要返回一些数据，返回的数据通过x0~x17返回

当然，对于不同的function id，参数的传递和返回是不一样的，对于CPU_SUSPEND函数调用在《Power_State_Coordination_Interface_PDD_v1_1_DEN0022D.pdf》中的5.2.1节描述如下：

参数由x0~x3这四个寄存器传递，其中x0用于指定function id
返回值通过x0传递

4.1.2 CPU_SUSPEND函数参数和返回值

《Power_State_Coordination_Interface_PDD_v1_1_DEN0022D.pdf》中，对CPU_SUSPEND的描述如下：

由下面可知，CPU_SUSPEND的作用是，使调用者进入对应的low-power状态

下面介绍了CPU_SUSPEND接口的function id和各个参数的含义

由上面可知各个参数的含义为：

参数	含义
function ID	第一个参数：指定调用该函数
power_state	第二个参数：指定要进入的C state 实际就是上面从设备树中提取出自定义的arm,psci-suspend-param信息，每个cpu各自保存在自己的percpu数组psci_states[]中，下标0预留给WFI，之后依次是该cpu在设备树中列出的各个idle state，例如msm8998的小核为{0, 0x00000002, 0x40000003} 各个字段的描述参见手册：在5.4.2节描述
entry_point_address	第三个参数：当cpu从idle退出时，从哪里开始取指执行，这是一个32/64bit的物理地址
context_id	第四个参数：仅在powerdown类型的状态下有意义，cpu被唤醒并从entry_point_address开始执行时，该值会出现在x0或者r0中；其含义完全由调用者caller自己定义（Linux在调用CPU_SUSPEND时传入的是0，并不是栈sp的地址）
return	返回值：可为下面，Linux中通过psci_to_linux_errno函数解析： SUCCESS : 0 INVALID_PARAMETERS : -2 DENIED : -3 INVALID_ADDRESS : -9

4.1.2.1 function ID - 指定要调用哪个接口

CPU_SUSPEND的function id默认为0xc4000001或者0x84000001

4.1.2.2 power_state - 用于配置要进入什么样的idle等级

参见《Power_State_Coordination_Interface_PDD_v1_1_DEN0022D.pdf》手册5.4.2章节

关于power_state，实际就是下面psci_dt_cpu_init_idle函数中从设备树中解析出来的arm,psci-suspend-param信息，这个字段被保存到每个cpu各自的percpu数组psci_states[]中：下标0预留给WFI（值为0），之后依次是该cpu在设备树中列出的各个idle state；以msm8998为例，小核为{0, 0x00000002, 0x40000003}，大核同样为{0, 0x00000002, 0x40000003}

power_state共有Original format和Extended StateID format两种，这两种格式的含义如下

ps：本文示例中的设备树（参数值如0x40000003，bit30为StateType）实际使用的是Extended StateID format

因为这两种格式各个字段含义是一样的，下面我们简单的来分析各个字段的含义，更多信息请参见手册

4.1.2.2.1 PowerLevel

PowerLevel描述如下，用于指定要控制的电源等级

0: 只控制cpu的电源
1: 控制cluster的电源
2: 控制整个系统的电源

关于电源域的拓扑结构描述如下：

4.1.2.2.2 StateType - cpu是否掉电

0: 表示仅仅是standby状态，此时cpu的上下文还是在的

1: 表示直接给cpu下电了，此时cpu的上下文丢失了，需要entry_point_address指定这个cpu唤醒后从哪开始执行，还需要context_id用于保存下电前的上下文，以便重新上电时恢复现场

4.1.2.2.3 StateID

可用的组合如下：

4.1.2.3 entry_point_address - cpu重新上电后从哪运行

指定cpu从idle唤醒时，从哪开始执行，这个地址只能是物理地址，（当然是因为地址线不过MMU啦）

4.1.2.4 context_id - 一个地址，用于掉电前保存上下文

因为上面power_state字段，在指定StateType时，有可能会使cpu掉电，此时就需要利用该参数保存掉电前的上下文，以便在后面重新上电时恢复这个上下文

4.1.2.5 psci_to_linux_errno - 解析psci调用返回值

《Power_State_Coordination_Interface_PDD_v1_1_DEN0022D.pdf》中的5.2.2节描述了函数调用的可能返回值

Linux中通过psci_to_linux_errno函数对返回值进行解析

/*
 * Translate a PSCI return code into a Linux errno value.
 *
 * @errno: value returned by a PSCI call (one of the PSCI_RET_* codes).
 *
 * Returns 0 for PSCI_RET_SUCCESS, otherwise a negative errno. Codes that are
 * not handled explicitly by the switch fall through to -EINVAL.
 */
static int psci_to_linux_errno(int errno)
{
	switch (errno) {
	case PSCI_RET_SUCCESS:
		return 0;
	case PSCI_RET_NOT_SUPPORTED:
		return -EOPNOTSUPP;
	/* Both "invalid parameter" flavours collapse onto -EINVAL. */
	case PSCI_RET_INVALID_PARAMS:
	case PSCI_RET_INVALID_ADDRESS:
		return -EINVAL;
	case PSCI_RET_DENIED:
		return -EPERM;
	};

	/* Default for every other PSCI_RET_* code. */
	return -EINVAL;
}

其中：

/* PSCI return values (inclusive of all PSCI versions) */
/* These are the codes that psci_to_linux_errno() maps onto Linux errnos. */
#define PSCI_RET_SUCCESS 0
#define PSCI_RET_NOT_SUPPORTED -1
#define PSCI_RET_INVALID_PARAMS -2
#define PSCI_RET_DENIED -3
#define PSCI_RET_ALREADY_ON -4
#define PSCI_RET_ON_PENDING -5
#define PSCI_RET_INTERNAL_FAILURE -6
#define PSCI_RET_NOT_PRESENT -7
#define PSCI_RET_DISABLED -8
#define PSCI_RET_INVALID_ADDRESS -9

4.1.3 __invoke_psci_fn_smc - 通过smc指令实现SMC Calling，完成对psci固件里面的函数进行调用

__invoke_psci_fn_smc实现如下：

函数位置：U:\linux-5.10.61\drivers\firmware\psci\psci.c

/*
 * Invoke a PSCI function through the SMC conduit.
 *
 * The three arguments are generic; their meaning depends on function_id.
 * For CPU_SUSPEND (see psci_cpu_suspend) they are, in order, the power_state,
 * the resume entry point and the context id (passed as 0 there).
 *
 * Returns the first result register (a0) of the call.
 */
static unsigned long __invoke_psci_fn_smc(
	unsigned long function_id, /* PSCI function to call */
	unsigned long arg0, /* CPU_SUSPEND: power_state of the target idle state */
	unsigned long arg1, /* CPU_SUSPEND: address to resume execution from */
	unsigned long arg2) /* CPU_SUSPEND: context id (0 in psci_cpu_suspend) */
{
	struct arm_smccc_res res;

	/* 1. Issue the SMC call; the unused argument slots are passed as 0. */
	arm_smccc_smc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);

	/* 2. Only the first result register is handed back to the caller. */
	return res.a0;
}

其中：

/* Convenience wrapper: forwards all arguments and appends a NULL quirk pointer. */
#define arm_smccc_smc(...) __arm_smccc_smc(__VA_ARGS__, NULL)

4.1.4 __arm_smccc_smc - 汇编完成SMC Calling工作，访问psci固件里面的接口

由下面的函数注释可知，在执行smc命令之前，需要把要传递的参数放进x0~x7这8个寄存器中，smc命令返回时，所调用的function的返回值放在x0~x3这4个寄存器中

另外，一个可选的quirk数据结构提供给厂商用于做一些自定义的操作

/**

* __arm_smccc_smc() - make SMC calls

* @a0-a7: arguments passed in registers 0 to 7

* @res: result values from registers 0 to 3

* @quirk: points to an arm_smccc_quirk, or NULL when no quirks are required.

* This function is used to make SMC calls following SMC Calling Convention.

* The content of the supplied param are copied to registers 0 to 7 prior

* to the SMC instruction. The return values are updated with the content

* from register 0 to 3 on return from the SMC instruction. An optional

* quirk structure provides vendor specific behavior.

asmlinkage void __arm_smccc_smc(unsigned long a0, unsigned long a1,

unsigned long a2, unsigned long a3, unsigned long a4,

unsigned long a5, unsigned long a6, unsigned long a7,

struct arm_smccc_res *res, struct arm_smccc_quirk *quirk);

W:\opensource\linux-5.10.61\arch\arm64\kernel\smccc-call.S

	.macro SMCCC instr
	/*
	 * Issue the conduit instruction. What the firmware does before it
	 * returns depends on the function id in x0; for CPU_SUSPEND the cpu
	 * is idle until it is woken up.
	 */
	\instr #0 /* expands to "smc #0" or "hvc #0" */
	/*
	 * Back from the firmware. The generic interface returns results in
	 * x0~x3 (CPU_SUSPEND only uses x0), so all four are saved here.
	 *
	 * Per the C prototype, a0~a7 occupy x0~x7, so the 9th argument (res)
	 * and the 10th argument (quirk) are read from the stack.
	 */
	ldr x4, [sp] /* x4 = res pointer (9th argument) */
	stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS] /* store x0/x1 into *res */
	stp x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS] /* store x2/x3 into *res */
	/* Check whether the caller supplied a vendor quirk structure. */
	ldr x4, [sp, #8] /* x4 = quirk pointer (10th argument) */
	cbz x4, 1f /* no quirk structure */ /* i.e. quirk == NULL, done */
	ldr x9, [x4, #ARM_SMCCC_QUIRK_ID_OFFS]
	cmp x9, #ARM_SMCCC_QUIRK_QCOM_A6
	b.ne 1f
	/* QCOM_A6 quirk: additionally save x6 into the quirk state field. */
	str x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS]
1: ret
	.endm

/*
 * void arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2,
 *		  unsigned long a3, unsigned long a4, unsigned long a5,
 *		  unsigned long a6, unsigned long a7, struct arm_smccc_res *res,
 *		  struct arm_smccc_quirk *quirk)
 *
 * The whole body is the SMCCC macro instantiated with the "smc" instruction.
 */
SYM_FUNC_START(__arm_smccc_smc)
	SMCCC	smc
SYM_FUNC_END(__arm_smccc_smc)
EXPORT_SYMBOL(__arm_smccc_smc)

4.1.5 smc指令介绍

参考文档：

https://developer.arm.com/documentation/ddi0597/2021-12/Base-Instructions/SMC--Secure-Monitor-Call-?lang=en

https://blog.csdn.net/u011280717/article/details/77395675

https://www.cnblogs.com/arnoldlu/p/14175126.html

https://blog.csdn.net/chenying126/article/details/78638944

smc指令是Secure Monitor Call的缩写，官方介绍如下：

smc指令对应的机器码如下：

注意：SMC有Thumb编码和ARM编码，T1是Thumb，A1是ARM。可以看到不管是Thumb还是ARM都有一个4位的立即数imm4，这个imm4在armv7a的架构中没有定义是干什么的，用户可以自己选择如何使用它，通常就跟使用SVC后面的立即数差不多。需要注意的是上图中Thumb指令的低16位在右边，所以imm4其实跟ARM指令的位置是一样的，都是在32位中的最低4位，这样在获取imm4的时候，我们就不需要去判断到底是ARM指令还是THUMB指令。

设置SMC异常向量表

SMC跟SVC指令很类似，都会进入一种软件异常模式。所以使用SMC必须要提供一个SMC的异常向量表。需要注意的是Monitor模式拥有自己的一套异常向量表，它并不与其他的异常/中断模式共享一套异常向量。Monitor模式所需要的异常向量入口保存在这个MVBAR(Monitor Vector Base Address)寄存器中，需要注意的是MVBAR必须在Secure world的PL1级别下才能够进行读写，也就是说MVBAR在系统中必须由底层软件去设置。

下面看一下如何设置SMC异常向量表，需要注意的是Monitor模式不仅异常向量是自己的一套，它的栈也是自己的，与其他模式（如SVC模式）使用的不是同一个栈，所以在系统初始化的时候需要指定好Monitor模式的栈

不深入研究了，脑壳疼

4.2 arm64中的cpu_operations结构组织关系

在arm64平台中，定义了cpu_operations数据结构，由该数据结构定义的位置也可以知道，该数据结构是专为arm64提供的

W:\opensource\linux-5.10.61\arch\arm64\include\asm\cpu_ops.h

/**
 * struct cpu_operations - Callback operations for hotplugging CPUs.
 *
 * @name:	Name of the property as appears in a devicetree cpu node's
 *		enable-method property. On systems booting with ACPI, @name
 *		identifies the struct cpu_operations entry corresponding to
 *		the boot protocol specified in the ACPI MADT table.
 * @cpu_init:	Reads any data necessary for a specific enable-method for a
 *		proposed logical id.
 * @cpu_prepare: Early one-time preparation step for a cpu. If there is a
 *		mechanism for doing so, tests whether it is possible to boot
 *		the given CPU.
 * @cpu_boot:	Boots a cpu into the kernel.
 * @cpu_postboot: Optionally, perform any post-boot cleanup or necessary
 *		synchronisation. Called from the cpu being booted.
 * @cpu_can_disable: Determines whether a CPU can be disabled based on
 *		mechanism-specific information.
 * @cpu_disable: Prepares a cpu to die. May fail for some mechanism-specific
 *		reason, which will cause the hot unplug to be aborted. Called
 *		from the cpu to be killed.
 * @cpu_die:	Makes a cpu leave the kernel. Must not fail. Called from the
 *		cpu being killed.
 * @cpu_kill:	Ensures a cpu has left the kernel. Called from another cpu.
 * @cpu_init_idle: Reads any data necessary to initialize CPU idle states for
 *		a proposed logical id.
 * @cpu_suspend: Suspends a cpu and saves the required context. May fail owing
 *		to wrong parameters or error conditions. Called from the
 *		CPU being suspended. Must be called with IRQs disabled.
 */
struct cpu_operations {
	const char	*name;
	int		(*cpu_init)(unsigned int);
	int		(*cpu_prepare)(unsigned int);
	int		(*cpu_boot)(unsigned int);
	void		(*cpu_postboot)(void);
#ifdef CONFIG_HOTPLUG_CPU
	bool		(*cpu_can_disable)(unsigned int cpu);
	int		(*cpu_disable)(unsigned int cpu);
	void		(*cpu_die)(unsigned int cpu);
	int		(*cpu_kill)(unsigned int cpu);
#endif
#ifdef CONFIG_CPU_IDLE
	int		(*cpu_init_idle)(unsigned int);
	int		(*cpu_suspend)(unsigned long); /* enter the requested C state */
#endif
};

4.2.1 全局数组cpu_ops[NR_CPUS]

每个cpu都有一个自己的ops

W:\opensource\linux-5.10.61\arch\arm64\kernel\cpu_ops.c

/* One cpu_operations pointer per logical cpu; slots are filled by init_cpu_ops(). */
static const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;

4.2.2 init_cpu_ops - 赋值cpu_ops[NR_CPUS]全局数组

* Read a cpu's enable method and record it in cpu_ops.

int __init init_cpu_ops(int cpu)

{

//1.从设备树中获取要使用哪种方法进入idle状态

const char *enable_method = cpu_read_enable_method(cpu);

//2.若设备树中没有指定，则退出

if (!enable_method)

return -ENODEV;

//3.由根据设备树中指定的method信息，赋值cpu_ops

// 注意：如果找不到对应的ops，就会返回-EOPNOTSUPP

cpu_ops[cpu] = cpu_get_ops(enable_method);

if (!cpu_ops[cpu]) {

pr_warn("Unsupported enable-method: %s\n", enable_method);

return -EOPNOTSUPP;

}

return 0;

}

4.2.3 cpu_read_enable_method

/*
 * Return the enable-method name for @cpu, or NULL if none is available.
 *
 * Without ACPI the name comes from the "enable-method" property of the cpu's
 * device tree node; with ACPI it comes from acpi_get_enable_method().
 */
static const char *__init cpu_read_enable_method(int cpu)
{
	const char *enable_method;

	if (acpi_disabled) {
		struct device_node *dn = of_get_cpu_node(cpu, NULL);

		if (!dn) {
			if (!cpu)
				pr_err("Failed to find device node for boot cpu\n");
			return NULL;
		}

		/* 1. Read the "enable-method" property of the cpu node. */
		enable_method = of_get_property(dn, "enable-method", NULL);
		if (!enable_method) {
			/*
			 * The boot CPU may not have an enable method (e.g.
			 * when spin-table is used for secondaries).
			 * Don't warn spuriously.
			 */
			if (cpu != 0)
				pr_err("%pOF: missing enable-method property\n", dn);
		}
		of_node_put(dn);
	} else {
		enable_method = acpi_get_enable_method(cpu);
		if (!enable_method) {
			/*
			 * In ACPI systems the boot CPU does not require
			 * checking the enable method since for some
			 * boot protocol (ie parking protocol) it need not
			 * be initialized. Don't warn spuriously.
			 */
			if (cpu != 0)
				pr_err("Unsupported ACPI enable-method\n");
		}
	}

	return enable_method;
}

4.2.4 acpi_get_enable_method

/*
 * Pick the enable-method name on ACPI systems.
 *
 * Returns "psci" when acpi_psci_present() is true, otherwise
 * "parking-protocol" when acpi_parking_protocol_valid(cpu) is true,
 * otherwise NULL. PSCI is checked first.
 */
static inline const char *acpi_get_enable_method(int cpu)
{
	if (acpi_psci_present())
		return "psci";

	if (acpi_parking_protocol_valid(cpu))
		return "parking-protocol";

	return NULL;
}

4.2.5 cpu_get_ops - 由method名称得到对应的cpu_operations

U:\linux-5.10.61\arch\arm64\kernel\cpu_ops.c

/*
 * Find the cpu_operations whose ->name equals @name.
 *
 * Returns the matching entry, or NULL when the selected table has no entry
 * with that name.
 */
static const struct cpu_operations * __init cpu_get_ops(const char *name)
{
	const struct cpu_operations *const *ops;

	/* 1. Choose the NULL-terminated table: DT one without ACPI, ACPI one otherwise. */
	ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops;

	while (*ops) {
		/* 2. Return the first entry whose name matches exactly. */
		if (!strcmp(name, (*ops)->name))
			return *ops;

		ops++;
	}

	return NULL;
}

4.2.6 cpu_ops[NR_CPUS]数组的实现

cpu_ops[cpu]指向的是下面两个全局数组之一中的某个元素，对于psci，指向的是cpu_psci_ops

/* NULL-terminated list of cpu_operations searched by cpu_get_ops() when ACPI is disabled. */
static const struct cpu_operations *const dt_supported_cpu_ops[] __initconst = {
	&smp_spin_table_ops,
	&cpu_psci_ops,
	NULL,
};

/* NULL-terminated list of cpu_operations searched by cpu_get_ops() when ACPI is enabled. */
static const struct cpu_operations *const acpi_supported_cpu_ops[] __initconst = {
#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
	&acpi_parking_protocol_ops,
#endif
	&cpu_psci_ops,
	NULL,
};

4.2.7 cpu_psci_ops - arm64适用的cpu_operations

注意：这个ops中并没有定义cpu_suspend回调函数哦

/*
 * cpu_operations for the "psci" enable-method.
 *
 * Note that neither .cpu_init_idle nor .cpu_suspend is set in this
 * initializer.
 */
const struct cpu_operations cpu_psci_ops = {
	.name = "psci",
	.cpu_init = cpu_psci_cpu_init,
	.cpu_prepare = cpu_psci_cpu_prepare,
	.cpu_boot = cpu_psci_cpu_boot,
#ifdef CONFIG_HOTPLUG_CPU
	.cpu_can_disable = cpu_psci_cpu_can_disable,
	.cpu_disable = cpu_psci_cpu_disable,
	.cpu_die = cpu_psci_cpu_die,
	.cpu_kill = cpu_psci_cpu_kill,
#endif
};

4.2.8 get_cpu_ops - 获取指定cpu对应的cpu_operations

在arm64平台中，get_cpu_ops实现如下：

/* Return the cpu_operations recorded for @cpu in cpu_ops[] (no bounds check here). */
const struct cpu_operations *get_cpu_ops(int cpu)
{
	return cpu_ops[cpu];
}

4.3 psci_cpuidle_probe - 遍历所有cpu，完成cpuidle driver的初始化

看一下这里的probe函数是不是和cpuidle-arm的驱动初始化函数arm_cpuidle_init函数长得一毛一样呢？该不会psci就是抄cpuidle-arm的吧！哈哈

文件位置：W:\opensource\linux-5.10.61\drivers\cpuidle\cpuidle-psci.c

* psci_idle_probe - Initializes PSCI cpuidle driver

* Initializes PSCI cpuidle driver for all CPUs, if any CPU fails

* to register cpuidle driver then rollback to cancel all CPUs

* registration.

static int psci_cpuidle_probe(struct platform_device *pdev)

{

int cpu, ret;

struct cpuidle_driver *drv;

struct cpuidle_device *dev;

//1.遍历每一个cpu，对每一个cpu设备执行下面的初始化

for_each_possible_cpu(cpu) {

ret = psci_idle_init_cpu(&pdev->dev, cpu);

if (ret)

goto out_fail;

}

psci_idle_init_cpuhp();

return 0;

out_fail:

while (--cpu >= 0) {

dev = per_cpu(cpuidle_devices, cpu);

drv = cpuidle_get_cpu_driver(dev);

cpuidle_unregister(drv);

psci_cpu_deinit_idle(cpu);

}

return ret;

}

4.4 psci_idle_init_cpu - 完成对指定cpu的cpuidle driver的初始化

该函数在psci驱动的probe函数中被调用

static int psci_idle_init_cpu(struct device *dev, int cpu)

{

struct cpuidle_driver *drv;

struct device_node *cpu_node;

const char *enable_method;

int ret = 0;

//1.获取这个cpu对应的设备树节点

cpu_node = of_cpu_device_node_get(cpu);

if (!cpu_node)

return -ENODEV;

* Check whether the enable-method for the cpu is PSCI, fail

* if it is not.

//2.如果这个cpu的"enable-method"属性值不是psci，就退出

enable_method = of_get_property(cpu_node, "enable-method", NULL);

if (!enable_method || (strcmp(enable_method, "psci")))

ret = -ENODEV;

of_node_put(cpu_node);

if (ret)

return ret;

drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);

if (!drv)

return -ENOMEM;

//3.初始化cpuidle_driver数据结构

drv->name = "psci_idle";

drv->owner = THIS_MODULE;

drv->cpumask = (struct cpumask *)cpumask_of(cpu);

* PSCI idle states relies on architectural WFI to be represented as

* state index 0.

//4.注意，这里指定的进入idle等级的函数为psci_enter_idle_state

drv->states[0].enter = psci_enter_idle_state;

drv->states[0].exit_latency = 1;

drv->states[0].target_residency = 1;

drv->states[0].power_usage = UINT_MAX;

strcpy(drv->states[0].name, "WFI");

strcpy(drv->states[0].desc, "ARM WFI");

* If no DT idle states are detected (ret == 0) let the driver

* initialization fail accordingly since there is no reason to

* initialize the idle driver if only wfi is supported, the

* default archictectural back-end already executes wfi

* on idle entry.

//6.从设备树中解析各个idle等级的信息

// 注意：

// a) 这里从设备树中解析的是标准内核支持的C state属性

// b) 返回值ret表示这个cpu支持多少个C state

ret = dt_init_idle_driver(drv, psci_idle_state_match, 1);

if (ret <= 0)

return ret ? : -ENODEV;

* Initialize PSCI idle states.

//7.解析psci自定义的一些C state属性

ret = psci_cpu_init_idle(dev, drv, cpu, ret);

if (ret) {

pr_err("CPU %d failed to PSCI idle\n", cpu);

return ret;

}

//8.注册这个driver

ret = cpuidle_register(drv, NULL);

if (ret)

goto deinit;

//9.暂不分析

cpuidle_cooling_register(drv);

return 0;

deinit:

psci_cpu_deinit_idle(cpu);

return ret;

}

4.4.1 psci_idle_state_match - match表

其中psci_idle_state_match实现如下：

/* Match table handed to dt_init_idle_driver() for "arm,idle-state" nodes. */
static const struct of_device_id psci_idle_state_match[] = {
	{ .compatible = "arm,idle-state",
	  .data = psci_enter_idle_state }, /* .data carries the enter callback for matched states */
	{ },
};

4.4.2 psci_cpuidle_data

/* Per-cpu PSCI cpuidle data (accessed via per_cpu_ptr/this_cpu_ptr in this file). */
struct psci_cpuidle_data {
	u32 *psci_states; /* power_state parameter for each idle state, indexed by state index */
	struct device *dev; /* set from psci_dt_attach_cpu() in psci_dt_cpu_init_topology() */
};

4.4.3 psci_enter_idle_state - psci进入指定的C state等级

/*
 * cpuidle ->enter callback: enter idle state @idx on the current cpu.
 *
 * Returns whatever psci_enter_state() returns.
 */
static int psci_enter_idle_state(
	struct cpuidle_device *dev, /* cpuidle device of the cpu going idle (unused here) */
	struct cpuidle_driver *drv, /* unused here */
	int idx) /* index of the idle state to enter */
{
	/*
	 * 1. Fetch this cpu's PSCI power_state table. The values come from
	 *    the arm,psci-suspend-param DT property, parsed by
	 *    psci_dt_parse_state_node().
	 */
	u32 *state = __this_cpu_read(psci_cpuidle_data.psci_states);

	return psci_enter_state(idx, state[idx]);
}

4.4.4 psci_enter_state - 进入指定级别的C state方法

psci_enter_state使指定的cpu进入指定级别的C state，调用的函数如下，这两个函数我们在下一章单独讲解

当idx为0时，调用cpu_do_idle
当idx不为0时，调用psci_cpu_suspend_enter

/*
 * Enter idle state @idx, passing the PSCI power_state @state to
 * psci_cpu_suspend_enter() through CPU_PM_CPU_IDLE_ENTER_PARAM.
 *
 * NOTE(review): per the accompanying text, idx 0 ends up in cpu_do_idle()
 * instead; that logic lives in __CPU_PM_CPU_IDLE_ENTER, which is not shown
 * here - confirm against include/linux/cpuidle.h.
 */
static inline int psci_enter_state(int idx, u32 state)
{
	return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter, idx, state);
}

其中CPU_PM_CPU_IDLE_ENTER_PARAM定义如下

/* Thin wrapper: forwards to __CPU_PM_CPU_IDLE_ENTER with 0 as the fourth argument. */
#define CPU_PM_CPU_IDLE_ENTER_PARAM(low_level_idle_enter, idx, state) \
	__CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0)

4.5 psci_cpu_init_idle - 解析psci自定义的一些C state信息

static int psci_cpu_init_idle(

struct device *dev,

struct cpuidle_driver *drv,

unsigned int cpu, //要初始化哪个cpu

unsigned int state_count) //这个cpu支持多少个C state

{

struct device_node *cpu_node;

int ret;

* If the PSCI cpu_suspend function hook has not been initialized

* idle states must not be enabled, so bail out

//1.如果cpu_suspend没有设置，则退出

// 这个在U:\linux-5.10.61\drivers\firmware\psci\psci.c中设置

if (!psci_ops.cpu_suspend)

return -EOPNOTSUPP;

//2.获取cpu对应的设备树节点

cpu_node = of_cpu_device_node_get(cpu);

if (!cpu_node)

return -ENODEV;

//3.从设备树中解析信息

ret = psci_dt_cpu_init_idle(dev, drv, cpu_node, state_count, cpu);

of_node_put(cpu_node);

return ret;

}

4.5.1 psci_dt_cpu_init_idle - 从设备树中解析psci自定义的C state信息，并赋值给percpu变量psci_cpuidle_data

/*
 * Build the per-cpu table of PSCI power_state values from the device tree
 * and store it in this cpu's psci_cpuidle_data.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -ENODEV when fewer
 * state nodes are found than expected, or the error of a failing helper.
 */
static int psci_dt_cpu_init_idle(
	struct device *dev,
	struct cpuidle_driver *drv,
	struct device_node *cpu_node, /* DT node of this cpu */
	unsigned int state_count, /* DT idle states of this cpu; WFI is added below */
	int cpu) /* logical cpu whose idle states are parsed */
{
	int i, ret = 0;
	u32 *psci_states;
	struct device_node *state_node;
	/* 1. Each cpu has its own psci_cpuidle_data instance. */
	struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu);

	/* 2. Make room for state index 0 as well. */
	state_count++; /* Add WFI state too */

	/* 3. One u32 per idle state; slot 0 (WFI) is never written by the loop below. */
	psci_states = devm_kcalloc(dev, state_count, sizeof(*psci_states), GFP_KERNEL);
	if (!psci_states)
		return -ENOMEM;

	/*
	 * 4. Walk the DT idle states of this cpu. The loop starts at 1
	 *    because index 0 is WFI, which has no DT node.
	 */
	for (i = 1; i < state_count; i++) {
		/* 4.1 Find the DT node of idle state i (DT index i - 1). */
		state_node = of_get_cpu_state_node(cpu_node, i - 1);
		if (!state_node)
			break;

		/*
		 * 4.2 Read arm,psci-suspend-param into psci_states[i].
		 *     With the msm8998 example shown in this article that
		 *     presumably gives {0, 0x00000002, 0x40000003} for a cpu
		 *     listing two states - verify against the cpu's
		 *     cpu-idle-states property. The table is per cpu, not
		 *     one array shared by big and little cores.
		 */
		ret = psci_dt_parse_state_node(state_node, &psci_states[i]);
		of_node_put(state_node);

		if (ret)
			return ret;

		pr_debug("psci-power-state %#x index %d\n", psci_states[i], i);
	}

	/* 5. The loop stopped early: a state node was missing. */
	if (i != state_count)
		return -ENODEV;

	/* Initialize optional data, used for the hierarchical topology. */
	/* 6. See psci_dt_cpu_init_topology(); it returns 0 without doing anything unless OSI is supported. */
	ret = psci_dt_cpu_init_topology(drv, data, state_count, cpu);
	if (ret < 0)
		return ret;

	/* Idle states parsed correctly, store them in the per-cpu struct. */
	/* 7. Publish the table in the per-cpu psci_cpuidle_data. */
	data->psci_states = psci_states;
	return 0;
}

4.5.2 psci_dt_parse_state_node - 从设备树中解析自定义的arm,psci-suspend-param属性

/*
 * Read and validate the PSCI power_state of one idle state node.
 *
 * @np:    idle state device tree node.
 * @state: output, receives the value of "arm,psci-suspend-param".
 *
 * Returns 0 on success, the of_property_read_u32() error when the property
 * cannot be read, or -EINVAL when psci_power_state_is_valid() rejects the
 * value.
 */
int psci_dt_parse_state_node(struct device_node *np, u32 *state)
{
	/* Read the arm,psci-suspend-param property from the device tree. */
	int err = of_property_read_u32(np, "arm,psci-suspend-param", state);

	if (err) {
		pr_warn("%pOF missing arm,psci-suspend-param property\n", np);
		return err;
	}

	if (!psci_power_state_is_valid(*state)) {
		pr_warn("Invalid PSCI power state %#x\n", *state);
		return -EINVAL;
	}

	return 0;
}

设备树示例如下：

U:\linux-5.10.61\arch\arm64\boot\dts\qcom\msm8998.dtsi

idle-states {
	entry-method = "psci";

	LITTLE_CPU_SLEEP_0: cpu-sleep-0-0 {
		compatible = "arm,idle-state";
		idle-state-name = "little-retention";
		arm,psci-suspend-param = <0x00000002>;
		entry-latency-us = <81>;
		exit-latency-us = <86>;
		min-residency-us = <200>;
	};

	LITTLE_CPU_SLEEP_1: cpu-sleep-0-1 {
		compatible = "arm,idle-state";
		idle-state-name = "little-power-collapse";
		arm,psci-suspend-param = <0x40000003>;
		entry-latency-us = <273>;
		exit-latency-us = <612>;
		min-residency-us = <1000>;
		local-timer-stop;
	};

	BIG_CPU_SLEEP_0: cpu-sleep-1-0 {
		compatible = "arm,idle-state";
		idle-state-name = "big-retention";
		arm,psci-suspend-param = <0x00000002>;
		entry-latency-us = <79>;
		exit-latency-us = <82>;
		min-residency-us = <200>;
	};

	BIG_CPU_SLEEP_1: cpu-sleep-1-1 {
		compatible = "arm,idle-state";
		idle-state-name = "big-power-collapse";
		arm,psci-suspend-param = <0x40000003>;
		entry-latency-us = <336>;
		exit-latency-us = <525>;
		min-residency-us = <1000>;
		local-timer-stop;
	};
};

4.5.3 psci_dt_cpu_init_topology -

static int psci_dt_cpu_init_topology(

struct cpuidle_driver *drv,

struct psci_cpuidle_data *data,

unsigned int state_count,

int cpu)

{

/* Currently limit the hierarchical topology to be used in OSI mode. */

if (!psci_has_osi_support())

return 0;

data->dev = psci_dt_attach_cpu(cpu);

if (IS_ERR_OR_NULL(data->dev))

return PTR_ERR_OR_ZERO(data->dev);

* Using the deepest state for the CPU to trigger a potential selection

* of a shared state for the domain, assumes the domain states are all

* deeper states.

//1.设置cpu进入指定的idle等级的函数，这里为什么是设置睡眠最深的C state的回调函数

drv->states[state_count - 1].enter = psci_enter_domain_idle_state;

psci_cpuidle_use_cpuhp = true;

return 0;

}

4.5.4 psci_enter_domain_idle_state - cpu进入指定级别的C state

/*
 * cpuidle ->enter callback installed for the deepest state by
 * psci_dt_cpu_init_topology().
 *
 * Returns @idx when psci_cpu_suspend_enter() returns 0, otherwise -1.
 * Also returns -1 when cpu_pm_enter() fails.
 */
static int psci_enter_domain_idle_state(
	struct cpuidle_device *dev,
	struct cpuidle_driver *drv,
	int idx) /* index of the idle state to enter */
{
	struct psci_cpuidle_data *data = this_cpu_ptr(&psci_cpuidle_data);
	u32 *states = data->psci_states;
	struct device *pd_dev = data->dev;
	u32 state;
	int ret;

	ret = cpu_pm_enter();
	if (ret)
		return -1;

	/* Do runtime PM to manage a hierarchical CPU toplogy. */
	RCU_NONIDLE(pm_runtime_put_sync_suspend(pd_dev));

	/*
	 * 1. Read the domain state.
	 *    NOTE(review): the article assumes this is a per-cpu value tied
	 *    to CONFIG_ARM_PSCI_CPUIDLE_DOMAIN that is normally 0;
	 *    psci_get_domain_state() is not shown here - confirm.
	 */
	state = psci_get_domain_state();
	/*
	 * 2. If no domain state was selected, fall back to this cpu's own
	 *    power_state for @idx, i.e. the arm,psci-suspend-param value
	 *    stored in the per-cpu psci_states[] table.
	 */
	if (!state)
		state = states[idx];

	/* 3. Enter the idle state; a non-zero result is reported as -1. */
	ret = psci_cpu_suspend_enter(state) ? -1 : idx;

	/* 4. Reaching this point means the cpu is back from idle (or entry failed). */
	RCU_NONIDLE(pm_runtime_get_sync(pd_dev));

	cpu_pm_exit();

	/* Clear the domain state to start fresh when back from idle. */
	psci_set_domain_state(0);
	return ret;
}

4.6 进入指定级别的C state

4.6.1 cpu_do_idle - 执行wfi指令，进入idle 0

arm64实现如下，执行wfi指令进入idle状态

W:\opensource\linux-5.10.61\arch\arm64\kernel\process.c

* cpu_do_idle()

* Idle the processor (wait for interrupt).

* If the CPU supports priority masking we must do additional work to

* ensure that interrupts are not masked at the PMR (because the core will

* not wake up if we block the wake up signal in the interrupt controller).

void noinstr cpu_do_idle(void)

{

if (system_uses_irq_prio_masking())

__cpu_do_idle_irqprio();

else

__cpu_do_idle();

}

4.6.1.1 __cpu_do_idle - 直接执行wfi进入睡眠

/* Plain idle entry: a full-system barrier (dsb sy) followed by wfi. */
static void noinstr __cpu_do_idle(void)
{
	dsb(sy);
	wfi();
}

4.6.1.2 __cpu_do_idle_irqprio - 执行wfi之前关闭一些中断相关操作

/*
 * Idle entry used when interrupt priority masking is in use: set the
 * PSTATE.I bit, unmask the PMR, idle, then restore both.
 */
static void noinstr __cpu_do_idle_irqprio(void)
{
	unsigned long pmr;
	unsigned long daif_bits;

	/* Save DAIF and set the I bit before touching the PMR. */
	daif_bits = read_sysreg(daif);
	write_sysreg(daif_bits | PSR_I_BIT, daif);

	/*
	 * Unmask PMR before going idle to make sure interrupts can
	 * be raised.
	 */
	pmr = gic_read_pmr();
	gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);

	/* Execute wfi to go idle. */
	__cpu_do_idle();

	/* Woken up: put the PMR and DAIF back to their saved values. */
	gic_write_pmr(pmr);
	write_sysreg(daif_bits, daif);
}

4.6.2 psci_cpu_suspend_enter - 进入指定的等级

/*
 * Enter the idle state described by the PSCI power_state @state.
 *
 * @state is the value for the selected idle state index, i.e. the
 * arm,psci-suspend-param property parsed from the device tree and kept in
 * the per-cpu psci_states[] table.
 *
 * Returns the result of psci_ops.cpu_suspend() or cpu_suspend().
 */
int psci_cpu_suspend_enter(u32 state)
{
	int ret;

	if (!psci_power_state_loses_context(state))
		/*
		 * Context is retained: call the firmware directly. The second
		 * argument is the resume entry point, passed as 0 here.
		 */
		ret = psci_ops.cpu_suspend(state, 0);
	else
		/* Context is lost: go through cpu_suspend() with the PSCI finisher. */
		ret = cpu_suspend(state, psci_suspend_finisher);

	return ret;
}

4.6.2.1 cpu_suspend

arm64实现如下：

U:\linux-5.10.61\arch\arm64\kernel\suspend.c

* cpu_suspend

* arg: argument to pass to the finisher function

* fn: finisher function pointer

int cpu_suspend(

unsigned long arg, //要进入的idle的等级

int (*fn)(unsigned long)) //进入idle的回调函数

{

int ret = 0;

unsigned long flags;

struct sleep_stack_data state;

* From this point debug exceptions are disabled to prevent

* updates to mdscr register (saved and restored along with

* general purpose registers) from kernel debuggers.

flags = local_daif_save();

* Function graph tracer state gets incosistent when the kernel

* calls functions that never return (aka suspend finishers) hence

* disable graph tracing during their execution.

pause_graph_tracing();

if (__cpu_suspend_enter(&state)) {

/* Call the suspend finisher */

//1.调用回调函数进入idle等级

ret = fn(arg);

* Never gets here, unless the suspend finisher fails.

* Successful cpu_suspend() should return from cpu_resume(),

* returning through this code path is considered an error

* If the return value is set to 0 force ret = -EOPNOTSUPP

* to make sure a proper error condition is propagated

//2.代码走到这里表示已经从idle等级中退出来了

if (!ret)

ret = -EOPNOTSUPP;

} else {

RCU_NONIDLE(__cpu_suspend_exit());

}

unpause_graph_tracing();

* Restore pstate flags. OS lock and mdscr have been already

* restored, so from this point onwards, debugging is fully

* renabled if it was enabled when core started shutdown.

local_daif_restore(flags);

return ret;

}

4.6.2.2 psci_suspend_finisher - psci进入指定的C state

/*
 * Suspend finisher passed to cpu_suspend() by psci_cpu_suspend_enter().
 *
 * @state: PSCI power_state of the idle state to enter (truncated to u32).
 */
static int psci_suspend_finisher(unsigned long state)
{
	u32 power_state = state;

	/*
	 * First argument: the power_state, i.e. the arm,psci-suspend-param
	 * value of the selected idle state.
	 * Second argument: __pa_symbol(cpu_resume), the physical address the
	 * cpu starts executing from when it leaves the idle state.
	 */
	return psci_ops.cpu_suspend(power_state, __pa_symbol(cpu_resume));
}

4.6.2.3 全局变量psci_ops.cpu_suspend在哪设置

调用路径（ACPI平台）：setup_arch -> psci_acpi_init -> psci_probe -> psci_0_2_set_functions；设备树平台则是从setup_arch -> psci_dt_init进入，最终同样走到psci_probe -> psci_0_2_set_functions

/* Excerpt: only the CPU_SUSPEND related assignments are shown ("..." marks omitted code). */
static void __init psci_0_2_set_functions(void)
{
	...
	/* Record the function id used for CPU_SUSPEND calls. */
	psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_FN_NATIVE(0_2, CPU_SUSPEND);
	/* Install the cpu_suspend callback. */
	psci_ops.cpu_suspend = psci_cpu_suspend;
	...
}

其中PSCI_FN_NATIVE实际就是拼接得到函数名，实现如下

/*
 * While a 64-bit OS can make calls with SMC32 calling conventions, for some
 * calls it is necessary to use SMC64 to pass or return 64-bit values.
 * For such calls PSCI_FN_NATIVE(version, name) will choose the appropriate
 * (native-width) function ID.
 */
#ifdef CONFIG_64BIT
#define PSCI_FN_NATIVE(version, name)	PSCI_##version##_FN64_##name
#else
#define PSCI_FN_NATIVE(version, name)	PSCI_##version##_FN_##name
#endif

则上面psci_function_id[PSCI_FN_CPU_SUSPEND]的值为PSCI_0_2_FN64_CPU_SUSPEND，该值定义如下：

/* SMC64 function id of CPU_SUSPEND: function number 1 on top of the FN64 base. */
#define PSCI_0_2_FN64_CPU_SUSPEND PSCI_0_2_FN64(1)

其中PSCI_0_2_FN64定义如下，计算得到的值为0xc4000001

/* Base of the 32-bit PSCI 0.2 function ids. */
#define PSCI_0_2_FN_BASE 0x84000000
#define PSCI_0_2_FN(n) (PSCI_0_2_FN_BASE + (n))
/* Adding this bit turns a 32-bit id into its 64-bit counterpart. */
#define PSCI_0_2_64BIT 0x40000000
/* 0x84000000 + 0x40000000 = 0xC4000000 */
#define PSCI_0_2_FN64_BASE (PSCI_0_2_FN_BASE + PSCI_0_2_64BIT)
/* e.g. PSCI_0_2_FN64(1) = 0xC4000001 */
#define PSCI_0_2_FN64(n) (PSCI_0_2_FN64_BASE + (n))

则PSCI_0_2_FN64_CPU_SUSPEND的值为0xc4000001，这实际上是ARM的一个SMC Calling调用，也就是调用psci固件里面的一个函数接口，0xc4000001对应的就是CPU_SUSPEND函数调用，这个在上面已经分析过了，这里不再赘述

由《DEN0028E_SMC_Calling_Convention-1_4alp0.pdf》中的Table 6-4可知，0xC4000000-0xC400001F对应的是电源控制对应的接口，实际上0xc4000001对应就是CPU_SUSPEND

4.6.2.4 psci_cpu_suspend - psci_ops.cpu_suspend实现，进入指定的C state

注意：下面传入的参数state，是C state对应的一个参数，实际就是上面从设备树中提取出自定义的arm,psci-suspend-param信息，对应数组psci_states[4] = {0x00000002, 0x40000003, 0x00000002, 0x40000003}，依次对应小核的state0、小核的state1、大核的state0、大核的state1

/*
 * Implementation of psci_ops.cpu_suspend: issue the CPU_SUSPEND call.
 *
 * Returns the firmware result translated by psci_to_linux_errno().
 */
static int psci_cpu_suspend(
	u32 state, /* PSCI power_state of the idle state to enter */
	unsigned long entry_point) /* address to resume execution from */
{
	int err;
	u32 fn;

	/*
	 * 1. Look up the function id recorded for CPU_SUSPEND in the global
	 *    psci_function_id[] table (0xc4000001 per the derivation above
	 *    for a 64-bit kernel).
	 */
	fn = psci_function_id[PSCI_FN_CPU_SUSPEND];
	/*
	 * 2. Make the call through the invoke_psci_fn function pointer.
	 *    Arguments: function id, power_state, entry point, and 0 as the
	 *    context id.
	 */
	err = invoke_psci_fn(fn, state, entry_point, 0);
	/* 3. The call has returned; convert the PSCI code to a Linux errno. */
	return psci_to_linux_errno(err);
}

4.6.2.4.1 invoke_psci_fn设置的地方如下：

/*
 * Select how PSCI calls are issued: point invoke_psci_fn at the HVC or SMC
 * helper and remember the conduit in psci_conduit.
 *
 * An unexpected conduit triggers a WARN and leaves invoke_psci_fn untouched;
 * psci_conduit is still updated.
 */
static void set_conduit(enum arm_smccc_conduit conduit)
{
	switch (conduit) {
	case SMCCC_CONDUIT_HVC:
		invoke_psci_fn = __invoke_psci_fn_hvc;
		break;
	case SMCCC_CONDUIT_SMC:
		invoke_psci_fn = __invoke_psci_fn_smc;
		break;
	default:
		WARN(1, "Unexpected PSCI conduit %d\n", conduit);
	}

	psci_conduit = conduit;
}

4.6.2.5 cpu_resume - cpu退出idle时从这里执行

/*
 * cpu_resume: where a cpu starts executing when it leaves a context-losing
 * idle state; psci_suspend_finisher() hands its physical address to the
 * firmware. The code is placed in the .idmap.text section.
 */
	.pushsection ".idmap.text", "awx"
SYM_CODE_START(cpu_resume)
	bl el2_setup // if in EL2 drop to EL1 cleanly
	bl __cpu_setup
	/* enable the MMU early - so we can access sleep_save_stash by va */
	adrp x1, swapper_pg_dir
	bl __enable_mmu
	/* Continue in _cpu_resume via a literal-pool load of its address. */
	ldr x8, =_cpu_resume
	br x8
SYM_CODE_END(cpu_resume)
	.ltorg
	.popsection

/*
 * _cpu_resume: second half of the resume path, entered from cpu_resume.
 * Finds this cpu's saved context through the MPIDR hash, restores sp, lets
 * cpu_do_resume restore the system registers, then reloads the saved
 * x19-x29/lr and returns 0.
 */
SYM_FUNC_START(_cpu_resume)
	mrs	x1, mpidr_el1
	adr_l	x8, mpidr_hash		// x8 = struct mpidr_hash virt address

	/* retrieve mpidr_hash members to compute the hash */
	ldr	x2, [x8, #MPIDR_HASH_MASK]
	ldp	w3, w4, [x8, #MPIDR_HASH_SHIFTS]
	ldp	w5, w6, [x8, #(MPIDR_HASH_SHIFTS + 8)]
	compute_mpidr_hash x7, x3, x4, x5, x6, x1, x2

	/* x7 contains hash index, let's use it to grab context pointer */
	ldr_l	x0, sleep_save_stash
	ldr	x0, [x0, x7, lsl #3]
	add	x29, x0, #SLEEP_STACK_DATA_CALLEE_REGS
	add	x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS

	/* load sp from context */
	ldr	x2, [x0, #CPU_CTX_SP]
	mov	sp, x2

	/*
	 * cpu_do_resume expects x0 to contain context address pointer
	 */
	bl	cpu_do_resume

#ifdef CONFIG_KASAN
	mov	x0, sp
	bl	kasan_unpoison_task_stack_below
#endif

	/* Reload x19-x28 from the saved callee-register area at x29. */
	ldp	x19, x20, [x29, #16]
	ldp	x21, x22, [x29, #32]
	ldp	x23, x24, [x29, #48]
	ldp	x25, x26, [x29, #64]
	ldp	x27, x28, [x29, #80]
	ldp	x29, lr, [x29]		// reload the saved x29 and lr last
	mov	x0, #0			// return value 0
	ret
SYM_FUNC_END(_cpu_resume)

五、总结

让我们来回顾一下文章开头的问题

5.1 怎样描述一个C state

不同的平台对"不同等级的C state"描述不一样，进入指定等级的C state的方法也是不一样的，这和平台芯片的设计相关，本文的示例中：

cpuidle-arm: 不管是进入哪一级idle，都是通过wfi指令实现，所不同的是，在进入不同级别的C state之前，会根据不同的级别完成不同的操作，例如关闭一些外设，刷cache等操作
cpuidle-big_little: 只有两个等级的idle，即idle0和idle1，同样，进入这两个等级也是通过wfi指令，所不同的是，在进入idle1时，会根据情况决定是否需要关闭cpu和cluster的电源，以便达到节能的目的，当然从idle1中退出时因为需要对cpu和cluster重新上电操作，会导致在idle1的退出延迟要大
cpuidle-psci: 现在基本所有的arm64平台都是用这个driver，在这个driver中通过SMC Calling的方式，调用psci固件里面的CPU_SUSPEND接口，而CPU_SUSPEND的实现，会根据传入的power_state参数的不同，决定是否关闭cluster或cpu的电源

5.2 怎样进入idle

在arm平台中主要有下面两种方法：

执行wfi，但是在执行wfi之前，可能会关闭一些片上外设、cpu、cluster的电源，以达到区分不同等级的C state的目的
通过SMC Calling调用PSCI固件中的接口，（我暂时没有找到SMC Calling的实现代码，有大佬懂这一块的话可以在评论区留言，也可以向本站投稿发文）

关注公众号不迷路：DumpStack

扫码加关注

本作品采用知识共享署名-非商业性使用 4.0 国际许可协议进行许可