This article picks up where the previous one left off. In its final step, nvme_probe kicks off nvme_reset_work to reset the controller. The work done in nvme_reset_work can be summarized in the following steps:

  1. On entry, nvme_reset_work first checks the NVME_CTRL_RESETTING flag to make sure the reset work cannot be entered twice.

  2. Call nvme_pci_enable.

  3. Call nvme_configure_admin_queue.

  4. Call nvme_init_queue.

  5. Call nvme_alloc_admin_tags.

  6. Call nvme_init_identify.

  7. Call nvme_setup_io_queues.

  8. Call nvme_start_queues/nvme_dev_add, and then nvme_queue_scan.

In the previous article we walked through nvme_init_identify; in this one we continue with the remaining functions called from nvme_reset_work.

Let's take a look at the code of nvme_setup_io_queues:

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
    struct nvme_queue *adminq = dev->queues[0];
    struct pci_dev *pdev = to_pci_dev(dev->dev);
    int result, nr_io_queues, size;

    /* Use the number of online CPU cores as the desired number of IO queues */
    nr_io_queues = num_online_cpus();

    /* Send a Set Features command to negotiate the number of IO queues */
    result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
    if (result < 0)
        return result;

    if (nr_io_queues == 0)
        return 0;

    /* If the SQs can live in the Controller Memory Buffer, adjust q_depth
     * to what fits there */
    if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
        result = nvme_cmb_qdepth(dev, nr_io_queues,
                sizeof(struct nvme_command));
        if (result > 0)
            dev->q_depth = result;
        else
            nvme_release_cmb(dev);
    }

    /* Compute the BAR size needed for the doorbell registers and remap
     * the BAR if the current 8192-byte mapping is not large enough */
    size = db_bar_size(dev, nr_io_queues);
    if (size > 8192) {
        iounmap(dev->bar);
        do {
            dev->bar = ioremap(pci_resource_start(pdev, 0), size);
            if (dev->bar)
                break;
            if (!--nr_io_queues)
                return -ENOMEM;
            size = db_bar_size(dev, nr_io_queues);
        } while (1);
        dev->dbs = dev->bar + 4096;
        adminq->q_db = dev->dbs;
    }

    /* Free the admin queue's interrupt and all vectors before re-allocating */
    free_irq(pci_irq_vector(pdev, 0), adminq);
    pci_free_irq_vectors(pdev);

    /* Allocate interrupt vectors, ideally one per IO queue */
    nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
            PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
    if (nr_io_queues <= 0)
        return -EIO;
    dev->max_qid = nr_io_queues;

    result = queue_request_irq(adminq);
    if (result) {
        adminq->cq_vector = -1;
        return result;
    }

    /* Create the IO queues */
    return nvme_create_io_queues(dev);
}

The code above looks long, but we only need to grab the key points. nvme_setup_io_queues does two main things:

  1. Call nvme_set_queue_count to send a Set Features command that negotiates the number of IO queues;

  2. With the number of IO queues settled, call nvme_create_io_queues to do the real work of creating them.
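
Before diving into these two steps, one detail in the code above is worth spelling out: the doorbell BAR sizing. As I recall from kernel versions of this era, db_bar_size() returns 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride), i.e. the 4 KB of controller registers plus one SQ tail doorbell and one CQ head doorbell (4 bytes each, scaled by the doorbell stride) for the admin queue and every IO queue. The snippet below is a standalone re-implementation for illustration only; db_stride is passed in as a plain parameter instead of being derived from the controller's CAP register.

#include <stdio.h>

/*
 * Illustrative re-implementation of db_bar_size(): the doorbell region
 * starts at offset 0x1000 (4096) in BAR0, and each queue (admin + IO)
 * needs an SQ tail doorbell plus a CQ head doorbell, i.e. 8 bytes per
 * queue when the doorbell stride is 1.
 */
static unsigned int db_bar_size(unsigned int nr_io_queues,
                                unsigned int db_stride)
{
    return 4096 + (nr_io_queues + 1) * 8 * db_stride;
}

int main(void)
{
    /* 8 IO queues with the common stride of 1 fit in the default mapping */
    printf("8 IO queues  : %u bytes\n", db_bar_size(8, 1));    /* 4168 */

    /* A few hundred queues push past 8192 bytes, which is why
     * nvme_setup_io_queues remaps the BAR in that case. */
    printf("512 IO queues: %u bytes\n", db_bar_size(512, 1));  /* 8200 */
    return 0;
}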

Let's look at nvme_set_queue_count first:

int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
    /* NSQR/NCQR in Dword 11 are zero-based, so request (*count - 1) of each */
    u32 q_count = (*count - 1) | ((*count - 1) << 16);
    u32 result;
    int status, nr_io_queues;

    status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
            &result);
    if (status < 0)
        return status;

    if (status > 0) {
        /* The controller rejected the request; fall back to 0 IO queues */
        dev_err(ctrl->dev, "Could not set queue count (%d)\n", status);
        *count = 0;
    } else {
        /* The completion's Dword 0 reports how many queues were allocated */
        nr_io_queues = min(result & 0xffff, result >> 16) + 1;
        *count = min(*count, nr_io_queues);
    }

    return 0;
}
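
The q_count encoding at the top of this function deserves a quick worked example. The Number of Queues feature uses zero-based counts: the number of IO SQs requested goes in bits 15:0 of Dword 11 and the number of IO CQs in bits 31:16, and the completion's Dword 0 reports the counts the controller actually allocated in the same layout. The standalone snippet below just reproduces that arithmetic; the completion value it decodes is a made-up example.

#include <stdio.h>

int main(void)
{
    int count = 8;    /* e.g. 8 online CPU cores */

    /* Same encoding as nvme_set_queue_count: zero-based counts, with the
     * requested IO SQs in bits 15:0 and the requested IO CQs in bits 31:16. */
    unsigned int q_count = (count - 1) | ((count - 1) << 16);
    printf("Set Features Dword 11 = 0x%08x\n", q_count);        /* 0x00070007 */

    /* Hypothetical completion Dword 0: the controller grants only
     * 4 IO SQs and 4 IO CQs (zero-based 3 and 3). */
    unsigned int result = 0x00030003;
    unsigned int granted = (result & 0xffff) < (result >> 16) ?
                           (result & 0xffff) : (result >> 16);
    int nr_io_queues = (int)granted + 1;

    printf("usable IO queues      = %d\n",
           count < nr_io_queues ? count : nr_io_queues);        /* 4 */
    return 0;
}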

Now open up nvme_set_features:

int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
                      void *buffer, size_t buflen, u32 *result)
{
    struct nvme_command c;
    union nvme_result res;
    int ret;

    memset(&c, 0, sizeof(c));
    c.features.opcode = nvme_admin_set_features;
    c.features.fid = cpu_to_le32(fid);
    c.features.dword11 = cpu_to_le32(dword11);

    ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
            buffer, buflen, 0, NVME_QID_ANY, 0, 0);
    if (ret >= 0 && result)
        *result = le32_to_cpu(res.u32);
    return ret;
}

The opcode of the Set Features command is 0x09. The feature that controls the number of IO queues is Feature ID 0x07 (Number of Queues), and the requested queue counts are carried in Dword 11 of the command.

Once the three key parameters of the Set Features command (opcode, fid, dword11) are filled in, __nvme_submit_sync_cmd is called to execute it, which completes the configuration of the number of IO queues. The execution path of __nvme_submit_sync_cmd was covered in the previous article and is not repeated here; for details, please refer to:

Linux NVMe Driver学习笔记之7:Identify初始化及命令提交过程

With the number of IO queues configured, we can now turn to the most important step in nvme_setup_io_queues: calling nvme_create_io_queues to create the IO queues.

static int nvme_create_io_queues(struct nvme_dev *dev)
{
    unsigned i, max;
    int ret = 0;

    /*
     * Allocate the nvme_queue structures, record them in dev->queues[],
     * and allocate the memory needed by each submission and completion
     * queue.
     */
    for (i = dev->queue_count; i <= dev->max_qid; i++) {
        if (!nvme_alloc_queue(dev, i, dev->q_depth)) {
            ret = -ENOMEM;
            break;
        }
    }

    max = min(dev->max_qid, dev->queue_count - 1);
    for (i = dev->online_queues; i <= max; i++) {
        ret = nvme_create_queue(dev->queues[i], i);
        if (ret)
            break;
    }

    /*
     * Ignore failing Create SQ/CQ commands, we can continue with less
     * than the desired amount of queues, and even a controller without
     * I/O queues can still be used to issue admin commands.  This might
     * be useful to upgrade a buggy firmware for example.
     */
    return ret >= 0 ? 0 : ret;
}

As the code shows, nvme_create_io_queues does two main things when creating the IO queues:

  • Call nvme_alloc_queue to allocate the memory needed by the SQ/CQ pairs. This was already covered in the walkthrough of nvme_configure_admin_queue and is skipped here; for details, please refer to: Linux NVMe Driver学习笔记之5:Admin SQ/CQ的创建

  • The second step is the key one: call nvme_create_queue to actually create the SQs/CQs.

Next, let's look at the code of nvme_create_queue:

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
    struct nvme_dev *dev = nvmeq->dev;
    int result;

    nvmeq->cq_vector = qid - 1;
    result = adapter_alloc_cq(dev, qid, nvmeq);
    if (result < 0)
        return result;

    result = adapter_alloc_sq(dev, qid, nvmeq);
    if (result < 0)
        goto release_cq;

    result = queue_request_irq(nvmeq);
    if (result < 0)
        goto release_sq;

    nvme_init_queue(nvmeq, qid);
    return result;

 release_sq:
    adapter_delete_sq(dev, qid);
 release_cq:
    adapter_delete_cq(dev, qid);
    return result;
}

As the code shows, nvme_create_queue first calls adapter_alloc_cq and adapter_alloc_sq to create the CQ and SQ, then calls queue_request_irq to request the interrupt, and finally calls nvme_init_queue to initialize the newly created CQ/SQ.

Since queue_request_irq and nvme_init_queue were already covered in an earlier article, they are skipped here; for details, please refer to: Linux NVMe Driver学习笔记之5:Admin SQ/CQ的创建

Here we focus on adapter_alloc_cq and adapter_alloc_sq:

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
        struct nvme_queue *nvmeq)
{
    struct nvme_command c;
    int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

    /*
     * Note: we (ab)use the fact that the prp fields survive if no data
     * is attached to the request.
     */
    memset(&c, 0, sizeof(c));
    c.create_cq.opcode = nvme_admin_create_cq;
    c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
    c.create_cq.cqid = cpu_to_le16(qid);
    c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
    c.create_cq.cq_flags = cpu_to_le16(flags);
    c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

    return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

The opcode of the Create I/O Completion Queue command is 0x05.

PRP Entry 1 is interpreted together with the PC (Physically Contiguous) flag:

  • If PC=1, the CQ occupies physically contiguous memory and PRP Entry 1 points to its base address;

  • If PC=0, the CQ memory is not physically contiguous and PRP Entry 1 points to a PRP list.
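
As the flags value in adapter_alloc_cq above shows, this driver always sets PC=1, so PRP Entry 1 simply carries the queue's base DMA address (cq_dma_addr). To get a feel for how much contiguous memory that address has to cover, the standalone snippet below multiplies an example queue depth by the fixed NVMe entry sizes (64 bytes per SQ entry, 16 bytes per CQ entry); the depth of 1024 is only an illustrative value.

#include <stdio.h>

/* Fixed entry sizes from the NVMe spec */
#define NVME_SQ_ENTRY_SIZE 64   /* one submission queue entry (command)    */
#define NVME_CQ_ENTRY_SIZE 16   /* one completion queue entry (completion) */

int main(void)
{
    unsigned int q_depth = 1024;    /* example IO queue depth */

    printf("SQ memory per queue: %u bytes\n", q_depth * NVME_SQ_ENTRY_SIZE);
    printf("CQ memory per queue: %u bytes\n", q_depth * NVME_CQ_ENTRY_SIZE);
    return 0;
}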

For a detailed explanation of the PRP structure, with examples, please refer to:

NVMe系列专题之四:寻址模型PRP和SGL解析

cqid and qsize carry the CQ identifier and the queue depth (qsize is zero-based, hence q_depth - 1).

cq_flags here is the combination of the PC flag and the IEN (Interrupts Enabled) flag, and irq_vector is the interrupt vector assigned to this CQ:

int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
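
The two flags being OR'ed here map directly onto bits of Dword 11 of the Create I/O Completion Queue command: bit 0 is PC and bit 1 is IEN. The definitions below mirror the ones in include/linux/nvme.h as I remember them, wrapped in a tiny standalone program that prints the resulting cq_flags value; treat the exact constant names as an assumption and check your kernel headers.

#include <stdio.h>

/* Mirrors the kernel definitions (include/linux/nvme.h) for illustration */
enum {
    NVME_QUEUE_PHYS_CONTIG = (1 << 0),  /* PC : queue is physically contiguous */
    NVME_CQ_IRQ_ENABLED    = (1 << 1),  /* IEN: interrupts enabled for this CQ */
};

int main(void)
{
    int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
    printf("cq_flags (low bits of Dword 11) = 0x%x\n", flags);  /* 0x3 */
    return 0;
}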

For the differences between the interrupt modes, please refer to:

NVMe系列专题之五:中断机制

Once the six key parameters of the Create I/O Completion Queue command (opcode, prp1, cqid, qsize, cq_flags, irq_vector) are filled in, nvme_submit_sync_cmd is called to execute it (internally via __nvme_submit_sync_cmd), which completes the creation of the I/O CQ. The execution path of __nvme_submit_sync_cmd was covered in the previous article and is not repeated here; for details, please refer to:

Linux NVMe Driver学习笔记之7:Identify初始化及命令提交过程

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
        struct nvme_queue *nvmeq)
{
    struct nvme_command c;
    int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

    /*
     * Note: we (ab)use the fact that the prp fields survive if no data
     * is attached to the request.
     */
    memset(&c, 0, sizeof(c));
    c.create_sq.opcode = nvme_admin_create_sq;
    c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
    c.create_sq.sqid = cpu_to_le16(qid);
    c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
    c.create_sq.sq_flags = cpu_to_le16(flags);
    c.create_sq.cqid = cpu_to_le16(qid);

    return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

The opcode, prp1, sqid and qsize fields of the Create I/O Submission Queue command are analogous to those of the Create I/O CQ command, so here we only call out sq_flags and cqid.

sq_flags here is the combination of the PC flag and the QPRIO (Queue Priority) flag:

int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
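
For reference, QPRIO lives in bits 2:1 of Dword 11 of the Create I/O Submission Queue command, with PC again in bit 0. The snippet below reproduces the kernel's priority constants as I remember them from include/linux/nvme.h and prints the resulting sq_flags value; treat the exact names as an assumption and check your kernel headers. Note that QPRIO only has an effect when weighted round robin arbitration is selected.

#include <stdio.h>

/* Mirrors the kernel definitions (include/linux/nvme.h) for illustration */
enum {
    NVME_QUEUE_PHYS_CONTIG = (1 << 0),  /* PC: queue is physically contiguous */
    NVME_SQ_PRIO_URGENT    = (0 << 1),  /* QPRIO values, Dword 11 bits 2:1    */
    NVME_SQ_PRIO_HIGH      = (1 << 1),
    NVME_SQ_PRIO_MEDIUM    = (2 << 1),
    NVME_SQ_PRIO_LOW       = (3 << 1),
};

int main(void)
{
    int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
    printf("sq_flags (low bits of Dword 11) = 0x%x\n", flags);  /* 0x5 */
    return 0;
}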

QPRIO (Queue Priority) indicates the arbitration priority of the commands in this SQ. The NVMe spec does not define the execution order of commands placed into an SQ, and the controller may fetch several commands at a time and process them in batches. The execution order within a single SQ is therefore not fixed, nor is the order across multiple SQs; this is governed by the command arbitration mechanism defined in the NVMe spec. For a more detailed introduction, please refer to:

NVMe系列专题之三:命令仲裁机制

Compared with the Create I/O CQ command, the Create I/O SQ command carries one extra field, cqid, because every SQ is bound to a CQ: IO SQs and CQs can be mapped one-to-one or many-to-one (in this driver each SQ uses its own qid as the cqid, i.e. a one-to-one mapping). For more on SQs and CQs, please refer to:

NVMe系列专题之二:队列(Queue)管理

Once the six key parameters of the Create I/O Submission Queue command (opcode, prp1, sqid, qsize, sq_flags, cqid) are filled in, nvme_submit_sync_cmd is called to execute it, which completes the creation of the I/O SQ. The execution path of __nvme_submit_sync_cmd was covered in the previous article and is not repeated here; for details, please refer to:

Linux NVMe Driver学习笔记之7:Identify初始化及命令提交过程
