struct block_device {
    dev_t bd_dev;  /* not a kdev_t - it's a search key */
    int bd_openers;
    struct inode *bd_inode;        /* will die */
    struct super_block *bd_super;
    ......
    struct block_device *bd_contains;
    unsigned bd_block_size;
    u8 bd_partno;
    struct hd_struct *bd_part;
    /* number of times partitions within this device have been opened. */
    unsigned bd_part_count;
    int bd_invalidated;
    struct gendisk *bd_disk;
    struct request_queue *bd_queue;
    struct backing_dev_info *bd_bdi;
    struct list_head bd_list;
    ......
} __randomize_layout;
struct gendisk {
    /* major, first_minor and minors are input parameters only,
     * don't use directly.  Use disk_devt() and disk_max_parts().
     */
    int major;                      /* major number of driver */
    int first_minor;
    int minors;                     /* maximum number of minors, =1 for
                                     * disks that can't be partitioned. */
    char disk_name[DISK_NAME_LEN];  /* name of major driver */
    char *(*devnode)(struct gendisk *gd, umode_t *mode);
    ......
    struct disk_part_tbl __rcu *part_tbl;
    struct hd_struct part0;
    const struct block_device_operations *fops;
    struct request_queue *queue;
    void *private_data;
    int flags;
    struct rw_semaphore lookup_sem;
    struct kobject *slave_dir;
    ......
};

static struct kobj_map *bdev_map;

static inline void add_disk(struct gendisk *disk)
{
    device_add_disk(NULL, disk);
}

/**
 * device_add_disk - add partitioning information to kernel list
 * @parent: parent device for the disk
 * @disk: per-device partitioning information
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
 */
void device_add_disk(struct device *parent, struct gendisk *disk)
{
    ......
    blk_register_region(disk_devt(disk), disk->minors, NULL,
                        exact_match, exact_lock, disk);
    ......
}

/*
 * Register device numbers dev..(dev+range-1)
 * range must be nonzero
 * The hash chain is sorted on range, so that subranges can override.
 */
void blk_register_region(dev_t devt, unsigned long range, struct module *module,
                         struct kobject *(*probe)(dev_t, int *, void *),
                         int (*lock)(dev_t, void *), void *data)
{
    kobj_map(bdev_map, devt, range, module, probe, lock, data);
}
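To make this registration path concrete, here is a minimal sketch (not taken from the kernel) of how a block driver of this era might allocate and register a gendisk. The names my_blk_fops, my_blk_register and the capacity are hypothetical; the request queue is assumed to have been prepared elsewhere.

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>

/* Hypothetical block_device_operations; a real driver fills in open/release/ioctl. */
static const struct block_device_operations my_blk_fops = {
    .owner = THIS_MODULE,
};

static int my_blk_register(struct request_queue *q, int major)
{
    struct gendisk *disk;

    disk = alloc_disk(16);                  /* 16 minors: whole disk + partitions */
    if (!disk)
        return -ENOMEM;

    disk->major = major;                    /* major number obtained elsewhere */
    disk->first_minor = 0;
    disk->fops = &my_blk_fops;
    disk->queue = q;                        /* the driver's request queue */
    snprintf(disk->disk_name, DISK_NAME_LEN, "myblk0");
    set_capacity(disk, 1024 * 1024);        /* capacity in 512-byte sectors */

    add_disk(disk);                         /* -> device_add_disk() -> blk_register_region() */
    return 0;
}

add_disk() is the point where the (major, minor) range becomes visible in bdev_map via kobj_map(), which is what get_gendisk() will later look up.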
struct hd_struct {
    sector_t start_sect;
    /*
     * nr_sects is protected by sequence counter. One might extend a
     * partition while IO is happening to it and update of nr_sects
     * can be non-atomic on 32bit machines with 64bit sector_t.
     */
    sector_t nr_sects;
    ......
    struct device __dev;
    struct kobject *holder_dir;
    int policy, partno;
    struct partition_meta_info *info;
    ......
    struct disk_stats __percpu *dkstats;
    ......
    struct percpu_ref ref;
    struct rcu_work rcu_work;
};
struct request_queue {
    /*
     * Together with queue_head for cacheline sharing
     */
    struct list_head queue_head;
    struct request *last_merge;
    struct elevator_queue *elevator;
    ......
    request_fn_proc *request_fn;
    make_request_fn *make_request_fn;
    ......
};

struct request {
    struct list_head queuelist;
    ......
    struct request_queue *q;
    ......
    struct bio *bio;
    struct bio *biotail;
    ......
};
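The two function pointers above correspond to the two kinds of drivers in this kernel era: a request-based driver lets the block layer queue and merge bios into struct request on queue_head and drains them through request_fn, while a bio-based driver (RAM disks, device mapper and the like) takes each bio directly through make_request_fn and bypasses the elevator. A minimal sketch, with hypothetical my_request_fn/my_make_request:

#include <linux/blkdev.h>
#include <linux/spinlock.h>

static void my_request_fn(struct request_queue *q);                        /* hypothetical */
static blk_qc_t my_make_request(struct request_queue *q, struct bio *bio); /* hypothetical */

static DEFINE_SPINLOCK(my_queue_lock);

/* Request-based: the block layer owns queuing/merging and calls my_request_fn(). */
static struct request_queue *my_init_rq_queue(void)
{
    return blk_init_queue(my_request_fn, &my_queue_lock);
}

/* Bio-based: every bio from generic_make_request() goes straight to my_make_request(). */
static struct request_queue *my_init_bio_queue(void)
{
    struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

    if (q)
        blk_queue_make_request(q, my_make_request);
    return q;
}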
The most important member of struct request is the bio. Inside struct bio, bi_next links to the next bio on the request queue, and bi_io_vec, an array of struct bio_vec, points to a set of pages.
struct bio {
    struct bio *bi_next;              /* request queue link */
    struct block_device *bi_bdev;
    blk_status_t bi_status;
    ......
    struct bvec_iter bi_iter;
    unsigned short bi_vcnt;           /* how many bio_vec's */
    unsigned short bi_max_vecs;       /* max bvl_vecs we can hold */
    atomic_t __bi_cnt;                /* pin count */
    struct bio_vec *bi_io_vec;        /* the actual vec list */
    ......
};

struct bio_vec {
    struct page *bv_page;
    unsigned int bv_len;
    unsigned int bv_offset;
};
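As a small illustration of how these structures are consumed, the sketch below walks every data segment of a bio with the kernel's bio_for_each_segment() iterator; the per-segment processing itself is left as a comment because it depends on the caller.

#include <linux/bio.h>
#include <linux/highmem.h>

/* Hypothetical consumer: visit each (page, offset, length) chunk of a bio. */
static void my_walk_bio(struct bio *bio)
{
    struct bio_vec bvec;
    struct bvec_iter iter;

    bio_for_each_segment(bvec, bio, iter) {
        char *buf = kmap_atomic(bvec.bv_page);

        /* bvec.bv_len bytes of payload start at buf + bvec.bv_offset */
        kunmap_atomic(buf);
    }
}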
/**
 * scsi_alloc_sdev - allocate and setup a scsi_Device
 * @starget: which target to allocate a &scsi_device for
 * @lun: which lun
 * @hostdata: usually NULL and set by ->slave_alloc instead
 *
 * Description:
 *     Allocate, initialize for io, and return a pointer to a scsi_Device.
 *     Stores the @shost, @channel, @id, and @lun in the scsi_Device, and
 *     adds scsi_Device to the appropriate list.
 *
 * Return value:
 *     scsi_Device pointer, or NULL on failure.
 **/
static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
                                           u64 lun, void *hostdata)
{
    struct scsi_device *sdev;

    sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size, GFP_ATOMIC);
    ......
    sdev->request_queue = scsi_alloc_queue(sdev);
    ......
}

struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
{
    struct Scsi_Host *shost = sdev->host;
    struct request_queue *q;

    q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
    if (!q)
        return NULL;
    q->cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
    q->rq_alloc_data = shost;
    q->request_fn = scsi_request_fn;
    q->init_rq_fn = scsi_init_rq;
    q->exit_rq_fn = scsi_exit_rq;
    q->initialize_rq_fn = scsi_initialize_rq;
    /* calls blk_queue_make_request(q, blk_queue_bio) */
    if (blk_init_allocated_queue(q) < 0) {
        blk_cleanup_queue(q);
        return NULL;
    }
    __scsi_init_queue(shost, q);
    ......
    return q;
}
static struct block_device *bd_acquire(struct inode *inode)
{
    struct block_device *bdev;
    ......
    bdev = bdget(inode->i_rdev);
    if (bdev) {
        spin_lock(&bdev_lock);
        if (!inode->i_bdev) {
            /*
             * We take an additional reference to bd_inode,
             * and it's released in clear_inode() of inode.
             * So, we can access it via ->i_mapping always
             * without igrab().
             */
            bdgrab(bdev);
            inode->i_bdev = bdev;
            inode->i_mapping = bdev->bd_inode->i_mapping;
        }
        spin_unlock(&bdev_lock);
    }
    return bdev;
}
/**
 * get_gendisk - get partitioning information for a given device
 * @devt: device to get partitioning information for
 * @partno: returned partition index
 *
 * This function gets the structure containing partitioning
 * information for the given device @devt.
 */
struct gendisk *get_gendisk(dev_t devt, int *partno)
{
    struct gendisk *disk = NULL;

    if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
        struct kobject *kobj;

        kobj = kobj_lookup(bdev_map, devt, partno);
        if (kobj)
            disk = dev_to_disk(kobj_to_dev(kobj));
    } else {
        struct hd_struct *part;

        part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
        if (part && get_disk(part_to_disk(part))) {
            *partno = part->partno;
            disk = part_to_disk(part);
        }
    }
    return disk;
}
do_direct_IO() contains two nested loops. The outer loop walks, one at a time, over all the blocks to be written this time. For each block it fetches the corresponding page in memory; within that block there is a start offset from and an end offset to, so the inner loop processes the data from from to to and calls submit_page_section() to submit it to the block device layer for writing.
static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
                        struct buffer_head *map_bh)
{
    const unsigned blkbits = sdio->blkbits;
    const unsigned i_blkbits = blkbits + sdio->blkfactor;
    int ret = 0;

    while (sdio->block_in_file < sdio->final_block_in_request) {
        struct page *page;
        size_t from, to;

        page = dio_get_page(dio, sdio);
        from = sdio->head ? 0 : sdio->from;
        to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
        sdio->head++;

        while (from < to) {
            unsigned this_chunk_bytes;   /* # of bytes mapped */
            unsigned this_chunk_blocks;  /* # of blocks */
            ......
            ret = submit_page_section(dio, sdio, page,
                                      from,
                                      this_chunk_bytes,
                                      sdio->next_block_for_io,
                                      map_bh);
            ......
            sdio->next_block_for_io += this_chunk_blocks;
            sdio->block_in_file += this_chunk_blocks;
            from += this_chunk_bytes;
            dio->result += this_chunk_bytes;
            sdio->blocks_available -= this_chunk_blocks;
            if (sdio->block_in_file == sdio->final_block_in_request)
                break;
            ......
        }
    }
}
submit_page_section() calls dio_bio_submit(), which in turn calls submit_bio() to submit the data to the block device layer. Its parameter, struct bio, is the generic transfer object that carries data to a block device.
/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 */
blk_qc_t submit_bio(struct bio *bio)
{
    ......
    return generic_make_request(bio);
}
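For orientation, here is a minimal sketch (not taken from the kernel) of how a caller in this kernel era could build a bio for a single page and hand it to submit_bio(); bdev, page and sector are assumed to be valid, and my_end_io is a hypothetical completion callback.

#include <linux/bio.h>
#include <linux/blkdev.h>

static void my_end_io(struct bio *bio)         /* hypothetical completion callback */
{
    /* bio->bi_status carries the result of the I/O */
    bio_put(bio);
}

static int my_write_page(struct block_device *bdev, struct page *page,
                         sector_t sector)
{
    struct bio *bio = bio_alloc(GFP_KERNEL, 1); /* room for one bio_vec */

    if (!bio)
        return -ENOMEM;

    bio->bi_bdev = bdev;                        /* target block device */
    bio->bi_iter.bi_sector = sector;            /* start sector on that device */
    bio->bi_end_io = my_end_io;
    bio_add_page(bio, page, PAGE_SIZE, 0);      /* attach the payload */
    bio_set_op_attrs(bio, REQ_OP_WRITE, 0);     /* mark it as a write */

    submit_bio(bio);                            /* -> generic_make_request() */
    return 0;
}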
struct mpage_da_data {
    struct inode *inode;
    ......
    pgoff_t first_page;                 /* The first page to write */
    pgoff_t next_page;                  /* Current page to examine */
    pgoff_t last_page;                  /* Last page to examine */
    struct ext4_map_blocks map;
    struct ext4_io_submit io_submit;    /* IO submission data */
    unsigned int do_map:1;
};

struct ext4_io_submit {
    ......
    struct bio *io_bio;
    ext4_io_end_t *io_end;
    sector_t io_next_block;
};
In io_submit_add_bh(), the bio is still empty at this point, so we call io_submit_init_bio() to initialize it.
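Roughly sketched (simplified, with details such as the writeback-cgroup hook omitted, so treat this as an illustration rather than the exact source), io_submit_init_bio() allocates a bio, points it at the buffer_head's device and starting sector, wires up ext4's completion callback and io_end, and records the bio in ext4_io_submit so that later buffers can be appended to it:

static int io_submit_init_bio(struct ext4_io_submit *io,
                              struct buffer_head *bh)
{
    struct bio *bio;

    bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
    if (!bio)
        return -ENOMEM;
    /* start sector of this buffer on the underlying device */
    bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
    bio->bi_bdev = bh->b_bdev;
    bio->bi_end_io = ext4_end_bio;              /* completion callback */
    bio->bi_private = ext4_get_io_end(io->io_end);
    io->io_bio = bio;                           /* the bio currently being filled */
    io->io_next_block = bh->b_blocknr;
    return 0;
}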
After make_request_fn() returns, bio_list_on_stack[0] may well have accumulated some new bios. The loop that follows calls bio_list_pop() to take those bios off bio_list_on_stack[0] and sort them into two lists, lower and same: as the names suggest, lower holds bios destined for lower-level block devices, and same holds bios for devices at the same level. Then lower, same and bio_list_on_stack[1] are all merged back into bio_list_on_stack[0] for unified processing, with lower first, of course, because the I/O of an upper block device can only complete after the I/O of the block devices below it has completed.
blk_qc_t generic_make_request(struct bio *bio)
{
    /*
     * bio_list_on_stack[0] contains bios submitted by the current
     * make_request_fn.
     * bio_list_on_stack[1] contains bios that were submitted before
     * the current make_request_fn, but that haven't been processed
     * yet.
     */
    struct bio_list bio_list_on_stack[2];
    blk_qc_t ret = BLK_QC_T_NONE;
    ......
    if (current->bio_list) {
        bio_list_add(&current->bio_list[0], bio);
        goto out;
    }

    bio_list_init(&bio_list_on_stack[0]);
    current->bio_list = bio_list_on_stack;
    do {
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);

        if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
            struct bio_list lower, same;

            /* Create a fresh bio_list for all subordinate requests */
            bio_list_on_stack[1] = bio_list_on_stack[0];
            bio_list_init(&bio_list_on_stack[0]);
            ret = q->make_request_fn(q, bio);

            blk_queue_exit(q);

            /* sort new bios into those for a lower level
             * and those for the same level
             */
            bio_list_init(&lower);
            bio_list_init(&same);
            while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
                if (q == bdev_get_queue(bio->bi_bdev))
                    bio_list_add(&same, bio);
                else
                    bio_list_add(&lower, bio);

            /* now assemble so we handle the lowest level first */
            bio_list_merge(&bio_list_on_stack[0], &lower);
            bio_list_merge(&bio_list_on_stack[0], &same);
            bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
        }
        ......
        bio = bio_list_pop(&bio_list_on_stack[0]);
    } while (bio);
    current->bio_list = NULL; /* deactivate */
out:
    return ret;
}
static void scsi_request_fn(struct request_queue *q)
    __releases(q->queue_lock)
    __acquires(q->queue_lock)
{
    struct scsi_device *sdev = q->queuedata;
    struct Scsi_Host *shost;
    struct scsi_cmnd *cmd;
    struct request *req;

    /*
     * To start with, we keep looping until the queue is empty, or until
     * the host is no longer able to accept any more requests.
     */
    shost = sdev->host;
    for (;;) {
        int rtn;
        /*
         * get next queueable request.  We do this early to make sure
         * that the request is fully prepared even if we cannot
         * accept it.
         */
        req = blk_peek_request(q);
        ......
        /*
         * Remove the request from the request list.
         */
        if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
            blk_start_request(req);
        ......
        cmd = req->special;
        ......
        /*
         * Dispatch the command to the low-level driver.
         */
        cmd->scsi_done = scsi_done;
        rtn = scsi_dispatch_cmd(cmd);
        ......
    }
    return;
    ......
}