
Commit f9d03f9

Christoph Hellwig authored and axboe committed
block: improve handling of the magic discard payload

Instead of allocating a single unused biovec for discard requests, send
them down without any payload. Instead we allow the driver to add a
"special" payload using a biovec embedded into struct request (unioned
over other fields never used while in the driver), and overloading the
number of segments for this case.

This has a couple of advantages:

 - we don't have to allocate the bio_vec
 - the amount of special casing for discard requests in the block layer
   is significantly reduced
 - using this same scheme for other request types is trivial, which will
   be important for implementing the new WRITE_ZEROES op on devices
   where it actually requires a payload (e.g. SCSI)
 - we can get rid of playing games with the request length, as we'll
   never touch it and completions will work just fine
 - it will allow us to support ranged discard operations in the future
   by merging non-contiguous discard bios into a single request
 - last but not least it removes a lot of code

This patch is the common base for my WIP series for ranged discards and
to remove discard_zeroes_data in favor of always using
REQ_OP_WRITE_ZEROES, so it would be good to get it in quickly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
1 parent be07e14 commit f9d03f9
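
The pattern in a nutshell: rather than faking up a biovec inside the bio, the driver fills in the special_vec embedded in struct request, sets RQF_SPECIAL_PAYLOAD, and lets blk_rq_map_sg() map that biovec as the request's single segment; freeing the buffer on completion remains the driver's job. A minimal driver-side sketch, distilled from the NVMe changes below (struct example_range and the example_* functions are hypothetical stand-ins, not part of the patch):

/* Hypothetical payload the device expects alongside a discard command. */
struct example_range {
	__le64	slba;	/* starting LBA */
	__le32	nlb;	/* number of blocks */
};

static int example_setup_discard(struct request *req)
{
	struct example_range *range;

	range = kmalloc(sizeof(*range), GFP_ATOMIC);
	if (!range)
		return -ENOMEM;

	/* ... fill in *range from blk_rq_pos()/blk_rq_bytes() ... */

	/* Attach the payload via the biovec embedded in struct request. */
	req->special_vec.bv_page = virt_to_page(range);
	req->special_vec.bv_offset = offset_in_page(range);
	req->special_vec.bv_len = sizeof(*range);
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
	return 0;
}

/* On completion the payload is still owned by the driver, which must
 * free it itself (compare nvme_cleanup_cmd() below). */
static void example_cleanup(struct request *req)
{
	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
		kfree(page_address(req->special_vec.bv_page) +
		      req->special_vec.bv_offset);
}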

13 files changed, +76 -138 lines changed


block/bio.c

+1 -9

@@ -1840,15 +1840,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
 	BUG_ON(sectors <= 0);
 	BUG_ON(sectors >= bio_sectors(bio));
 
-	/*
-	 * Discards need a mutable bio_vec to accommodate the payload
-	 * required by the DSM TRIM and UNMAP commands.
-	 */
-	if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
-		split = bio_clone_bioset(bio, gfp, bs);
-	else
-		split = bio_clone_fast(bio, gfp, bs);
-
+	split = bio_clone_fast(bio, gfp, bs);
 	if (!split)
 		return NULL;

block/blk-core.c

+2 -32

@@ -1475,38 +1475,6 @@ void blk_put_request(struct request *req)
 }
 EXPORT_SYMBOL(blk_put_request);
 
-/**
- * blk_add_request_payload - add a payload to a request
- * @rq: request to update
- * @page: page backing the payload
- * @offset: offset in page
- * @len: length of the payload.
- *
- * This allows to later add a payload to an already submitted request by
- * a block driver.  The driver needs to take care of freeing the payload
- * itself.
- *
- * Note that this is a quite horrible hack and nothing but handling of
- * discard requests should ever use it.
- */
-void blk_add_request_payload(struct request *rq, struct page *page,
-		int offset, unsigned int len)
-{
-	struct bio *bio = rq->bio;
-
-	bio->bi_io_vec->bv_page = page;
-	bio->bi_io_vec->bv_offset = offset;
-	bio->bi_io_vec->bv_len = len;
-
-	bio->bi_iter.bi_size = len;
-	bio->bi_vcnt = 1;
-	bio->bi_phys_segments = 1;
-
-	rq->__data_len = rq->resid_len = len;
-	rq->nr_phys_segments = 1;
-}
-EXPORT_SYMBOL_GPL(blk_add_request_payload);
-
 bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 			    struct bio *bio)
 {
@@ -2642,6 +2610,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		return false;
 	}
 
+	WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD);
+
 	req->__data_len -= total_bytes;
 
 	/* update sector only for requests with clear definition of sector */

block/blk-lib.c

+1 -1

@@ -80,7 +80,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			req_sects = end_sect - sector;
 		}
 
-		bio = next_bio(bio, 1, gfp_mask);
+		bio = next_bio(bio, 0, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev = bdev;
 		bio_set_op_attrs(bio, op, 0);

block/blk-merge.c

+17 -36

@@ -241,18 +241,13 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	if (!bio)
 		return 0;
 
-	/*
-	 * This should probably be returning 0, but blk_add_request_payload()
-	 * (Christoph!!!!)
-	 */
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_SAME:
 	case REQ_OP_WRITE_ZEROES:
+		return 0;
+	case REQ_OP_WRITE_SAME:
 		return 1;
-	default:
-		break;
 	}
 
 	fbio = bio;
@@ -410,39 +405,21 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 		*bvprv = *bvec;
 }
 
+static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv,
+		struct scatterlist *sglist, struct scatterlist **sg)
+{
+	*sg = sglist;
+	sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
+	return 1;
+}
+
 static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
 			     struct scatterlist *sglist,
 			     struct scatterlist **sg)
 {
 	struct bio_vec bvec, bvprv = { NULL };
 	struct bvec_iter iter;
-	int nsegs, cluster;
-
-	nsegs = 0;
-	cluster = blk_queue_cluster(q);
-
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		/*
-		 * This is a hack - drivers should be neither modifying the
-		 * biovec, nor relying on bi_vcnt - but because of
-		 * blk_add_request_payload(), a discard bio may or may not have
-		 * a payload we need to set up here (thank you Christoph) and
-		 * bi_vcnt is really the only way of telling if we need to.
-		 */
-		if (!bio->bi_vcnt)
-			return 0;
-		/* Fall through */
-	case REQ_OP_WRITE_SAME:
-		*sg = sglist;
-		bvec = bio_iovec(bio);
-		sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
-		return 1;
-	default:
-		break;
-	}
+	int cluster = blk_queue_cluster(q), nsegs = 0;
 
 	for_each_bio(bio)
 		bio_for_each_segment(bvec, bio, iter)
@@ -462,7 +439,11 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	struct scatterlist *sg = NULL;
 	int nsegs = 0;
 
-	if (rq->bio)
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg);
+	else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
+		nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg);
+	else if (rq->bio)
 		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
 
 	if (unlikely(rq->rq_flags & RQF_COPY_USER) &&
@@ -495,7 +476,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	 * Something must have been wrong if the figured number of
 	 * segment is bigger than number of req's physical segments
 	 */
-	WARN_ON(nsegs > rq->nr_phys_segments);
+	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
 
 	return nsegs;
 }
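
The blk_rq_map_sg() hunk above checks nsegs against blk_rq_nr_phys_segments() rather than rq->nr_phys_segments, and the driver diffs below make the same substitution. The helper itself is added to include/linux/blkdev.h (one of the 13 changed files not shown in this excerpt); presumably it just accounts for the special payload as a single extra segment, along these lines:

/*
 * Number of physical segments as sent to the device: a request with a
 * special payload carries exactly one segment that nr_phys_segments
 * does not account for.  (Sketch of the blkdev.h helper, which is not
 * visible in the excerpt above.)
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

This keeps nr_phys_segments at 0 for payload-less discards while still telling drivers to reserve one scatterlist entry whenever a special payload is attached.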

drivers/nvme/host/core.c

+4 -13

@@ -239,8 +239,6 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *cmnd)
 {
 	struct nvme_dsm_range *range;
-	struct page *page;
-	int offset;
 	unsigned int nr_bytes = blk_rq_bytes(req);
 
 	range = kmalloc(sizeof(*range), GFP_ATOMIC);
@@ -257,17 +255,10 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	cmnd->dsm.nr = 0;
 	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	req->completion_data = range;
-	page = virt_to_page(range);
-	offset = offset_in_page(range);
-	blk_add_request_payload(req, page, offset, sizeof(*range));
-
-	/*
-	 * we set __data_len back to the size of the area to be discarded
-	 * on disk. This allows us to report completion on the full amount
-	 * of blocks described by the request.
-	 */
-	req->__data_len = nr_bytes;
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range);
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
 
 	return BLK_MQ_RQ_QUEUE_OK;
 }

drivers/nvme/host/nvme.h

+4 -2

@@ -236,8 +236,10 @@ static inline unsigned nvme_map_len(struct request *rq)
 
 static inline void nvme_cleanup_cmd(struct request *req)
 {
-	if (req_op(req) == REQ_OP_DISCARD)
-		kfree(req->completion_data);
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		kfree(page_address(req->special_vec.bv_page) +
+			req->special_vec.bv_offset);
+	}
 }
 
 static inline int nvme_error_status(u16 status)

drivers/nvme/host/pci.c

+14 -13

@@ -302,14 +302,14 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
 static __le64 **iod_list(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	return (__le64 **)(iod->sg + req->nr_phys_segments);
+	return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req));
 }
 
 static int nvme_init_iod(struct request *rq, unsigned size,
 		struct nvme_dev *dev)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
-	int nseg = rq->nr_phys_segments;
+	int nseg = blk_rq_nr_phys_segments(rq);
 
 	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
 		iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
@@ -339,8 +339,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
 	__le64 **list = iod_list(req);
 	dma_addr_t prp_dma = iod->first_dma;
 
-	nvme_cleanup_cmd(req);
-
 	if (iod->npages == 0)
 		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
 	for (i = 0; i < iod->npages; i++) {
@@ -510,7 +508,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req,
 			DMA_TO_DEVICE : DMA_FROM_DEVICE;
 	int ret = BLK_MQ_RQ_QUEUE_ERROR;
 
-	sg_init_table(iod->sg, req->nr_phys_segments);
+	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
 	iod->nents = blk_rq_map_sg(q, req, iod->sg);
 	if (!iod->nents)
 		goto out;
@@ -566,6 +564,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 		}
 	}
 
+	nvme_cleanup_cmd(req);
 	nvme_free_iod(dev, req);
 }
 
@@ -596,20 +595,20 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		}
 	}
 
-	map_len = nvme_map_len(req);
-	ret = nvme_init_iod(req, map_len, dev);
+	ret = nvme_setup_cmd(ns, req, &cmnd);
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		return ret;
 
-	ret = nvme_setup_cmd(ns, req, &cmnd);
+	map_len = nvme_map_len(req);
+	ret = nvme_init_iod(req, map_len, dev);
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
-		goto out;
+		goto out_free_cmd;
 
-	if (req->nr_phys_segments)
+	if (blk_rq_nr_phys_segments(req))
 		ret = nvme_map_data(dev, req, map_len, &cmnd);
 
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
-		goto out;
+		goto out_cleanup_iod;
 
 	blk_mq_start_request(req);
 
@@ -620,14 +619,16 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		else
 			ret = BLK_MQ_RQ_QUEUE_ERROR;
 		spin_unlock_irq(&nvmeq->q_lock);
-		goto out;
+		goto out_cleanup_iod;
 	}
 	__nvme_submit_cmd(nvmeq, &cmnd);
 	nvme_process_cq(nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
 	return BLK_MQ_RQ_QUEUE_OK;
-out:
+out_cleanup_iod:
 	nvme_free_iod(dev, req);
+out_free_cmd:
+	nvme_cleanup_cmd(req);
 	return ret;
 }

drivers/nvme/host/rdma.c

+5 -8

@@ -952,8 +952,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_rdma_device *dev = queue->device;
 	struct ib_device *ibdev = dev->dev;
-	int nents, count;
-	int ret;
+	int count, ret;
 
 	req->num_sge = 1;
 	req->inline_data = false;
@@ -965,16 +964,14 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 		return nvme_rdma_set_sg_null(c);
 
 	req->sg_table.sgl = req->first_sgl;
-	ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments,
-				req->sg_table.sgl);
+	ret = sg_alloc_table_chained(&req->sg_table,
+			blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
 	if (ret)
 		return -ENOMEM;
 
-	nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
-	BUG_ON(nents > rq->nr_phys_segments);
-	req->nents = nents;
+	req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
 
-	count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents,
+	count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
 			rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 	if (unlikely(count <= 0)) {
 		sg_free_table_chained(&req->sg_table, true);

drivers/nvme/target/loop.c

+2 -2

@@ -185,13 +185,13 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (blk_rq_bytes(req)) {
 		iod->sg_table.sgl = iod->first_sgl;
 		ret = sg_alloc_table_chained(&iod->sg_table,
-			req->nr_phys_segments, iod->sg_table.sgl);
+				blk_rq_nr_phys_segments(req),
+				iod->sg_table.sgl);
 		if (ret)
 			return BLK_MQ_RQ_QUEUE_BUSY;
 
 		iod->req.sg = iod->sg_table.sgl;
 		iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);
-		BUG_ON(iod->req.sg_cnt > req->nr_phys_segments);
 	}
 
 	blk_mq_start_request(req);

drivers/scsi/scsi_lib.c

+3 -3

@@ -1007,8 +1007,8 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
 	/*
 	 * If sg table allocation fails, requeue request later.
 	 */
-	if (unlikely(sg_alloc_table_chained(&sdb->table, req->nr_phys_segments,
-					sdb->table.sgl)))
+	if (unlikely(sg_alloc_table_chained(&sdb->table,
+			blk_rq_nr_phys_segments(req), sdb->table.sgl)))
 		return BLKPREP_DEFER;
 
 	/*
@@ -1040,7 +1040,7 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 	bool is_mq = (rq->mq_ctx != NULL);
 	int error;
 
-	BUG_ON(!rq->nr_phys_segments);
+	BUG_ON(!blk_rq_nr_phys_segments(rq));
 
 	error = scsi_init_sgtable(rq, &cmd->sdb);
 	if (error)
