
Commit 6edc381

amotin authored and Ryan Moeller committed
Split dmu_zfetch() speculation and execution parts
To make better predictions on parallel workloads, dmu_zfetch() should be called as early as possible to reduce possible request reordering. In particular, it should be called before dmu_buf_hold_array_by_dnode() calls dbuf_hold(), which may sleep waiting for indirect blocks and wake multiple threads at the same time on completion; that can significantly reorder the requests, making the stream look random. But we should not issue prefetch requests before the on-demand ones, since they may reach the disks first despite the I/O scheduler, increasing on-demand request latency.

This patch splits dmu_zfetch() into two functions: dmu_zfetch_prepare() and dmu_zfetch_run(). The first can be executed as early as needed. It only updates statistics and makes predictions without issuing any I/Os. The I/O issuance is handled by dmu_zfetch_run(), which can be called later, when all on-demand I/Os are already issued. It even tracks the activity of other concurrent threads, issuing the prefetch only when _all_ on-demand requests are issued.

For many years this was a big problem for storage servers handling deep request queues from their clients: they had to either serialize consecutive reads to make the ZFS prefetcher usable, or execute the incoming requests as-is and get almost no prefetch from ZFS, relying only on deep enough prefetch by the clients. The benefits of those approaches varied, but neither was perfect. With this patch, deeper-queue sequential read benchmarks with CrystalDiskMark from Windows via iSCSI to a FreeBSD target show me much better throughput with an almost 100% prefetcher hit rate, compared to almost zero before.

While there, I also removed the per-stream zs_lock as useless, being completely covered by the parent zf_lock. I also reused the zs_blocks refcount to track the zf_stream linkage of the stream, since I believe the previous zs_fetch == NULL check in dmu_zfetch_stream_done() was racy.

Delete prefetch streams when they reach the ends of files. This saves up to 1KB of RAM per file and reduces searches through the stream list.

Block data prefetch (speculation and indirect block prefetch are still done, since they are cheaper) if all dbufs of the stream are already in the DMU cache. The first cache miss immediately fires all the prefetch that would have been done for the stream by that time. This saves some CPU time if the same files, fitting within the DMU cache capacity, are read over and over.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Adam Moss <c@yotes.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes openzfs#11652
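For context, a minimal sketch, assumed from the prototypes in the dmu_zfetch.h diff below (the dmu_zfetch.c body is not shown on this page), of how the retained dmu_zfetch() entry point can compose the two new halves:

void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock)
{
	zstream_t *zs;

	/* Update statistics and make predictions; no I/O is issued here. */
	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);

	/* NULL means nothing to prefetch; otherwise issue the I/O now. */
	if (zs != NULL)
		dmu_zfetch_run(zs, missed, have_lock);
}

For single-buffer readers such as dbuf_read() the two calls stay back-to-back; dmu_buf_hold_array_by_dnode() below pulls them apart so that prediction happens before dbuf_hold() and issuance after the demand reads.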
1 parent a340199 commit 6edc381

4 files changed: +195 -120 lines


include/sys/dmu_zfetch.h (+16 -7)
@@ -49,28 +49,37 @@ typedef struct zfetch {
 
 typedef struct zstream {
 	uint64_t	zs_blkid;	/* expect next access at this blkid */
-	uint64_t	zs_pf_blkid;	/* next block to prefetch */
+	uint64_t	zs_pf_blkid1;	/* first block to prefetch */
+	uint64_t	zs_pf_blkid;	/* block to prefetch up to */
 
 	/*
 	 * We will next prefetch the L1 indirect block of this level-0
 	 * block id.
 	 */
-	uint64_t	zs_ipf_blkid;
+	uint64_t	zs_ipf_blkid1;	/* first block to prefetch */
+	uint64_t	zs_ipf_blkid;	/* block to prefetch up to */
 
-	kmutex_t	zs_lock;	/* protects stream */
-	hrtime_t	zs_atime;	/* time last prefetch issued */
-	hrtime_t	zs_start_time;	/* start of last prefetch */
 	list_node_t	zs_node;	/* link for zf_stream */
+	hrtime_t	zs_atime;	/* time last prefetch issued */
 	zfetch_t	*zs_fetch;	/* parent fetch */
-	zfs_refcount_t	zs_blocks;	/* number of pending blocks in the stream */
+	boolean_t	zs_missed;	/* stream saw cache misses */
+	zfs_refcount_t	zs_callers;	/* number of pending callers */
+	/*
+	 * Number of stream references: dnode, callers and pending blocks.
+	 * The stream memory is freed when the number returns to zero.
+	 */
+	zfs_refcount_t	zs_refs;
 } zstream_t;
 
 void		zfetch_init(void);
 void		zfetch_fini(void);
 
 void		dmu_zfetch_init(zfetch_t *, struct dnode *);
 void		dmu_zfetch_fini(zfetch_t *);
-void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t,
+zstream_t	*dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
+		    boolean_t);
+void		dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
+void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
 	    boolean_t);
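The zs_refs comment above defines the stream's lifetime: one reference for the dnode's zfetch_t, one per pending caller, and one per in-flight prefetch block. A hedged sketch of the release side (the helper name dmu_zfetch_stream_put() is illustrative; the committed function is not shown on this page):

/*
 * Drop one zs_refs holder; whoever brings the count to zero frees the
 * stream.  This replaces the racy zs_fetch == NULL check in
 * dmu_zfetch_stream_done() mentioned in the commit message.
 */
static void
dmu_zfetch_stream_put(zstream_t *zs)
{
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) {
		ASSERT0(zfs_refcount_count(&zs->zs_callers));
		zfs_refcount_destroy(&zs->zs_refs);
		zfs_refcount_destroy(&zs->zs_callers);
		kmem_free(zs, sizeof (*zs));
	}
}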

module/zfs/dbuf.c (+3 -2)
@@ -1640,7 +1640,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	mutex_exit(&db->db_mtx);
 	if (err == 0 && prefetch) {
 		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-		    flags & DB_RF_HAVESTRUCT);
+		    B_FALSE, flags & DB_RF_HAVESTRUCT);
 	}
 	DB_DNODE_EXIT(db);
 	DBUF_STAT_BUMP(hash_hits);
@@ -1662,6 +1662,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	 */
 	if (!err && prefetch) {
 		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+		    db->db_state != DB_CACHED,
 		    flags & DB_RF_HAVESTRUCT);
 	}
 
@@ -1691,7 +1692,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	mutex_exit(&db->db_mtx);
 	if (prefetch) {
 		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-		    flags & DB_RF_HAVESTRUCT);
+		    B_TRUE, flags & DB_RF_HAVESTRUCT);
 	}
 	DB_DNODE_EXIT(db);
 	DBUF_STAT_BUMP(hash_misses);
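Each call site above now supplies the new missed argument: B_FALSE on a guaranteed hit, B_TRUE on a miss, and db->db_state != DB_CACHED where either is possible. On the execution side, a hedged sketch, assuming the commit message's "issuing the prefetch only when _all_ on-demand requests are issued" is implemented by draining the zs_callers refcount (the real function body is not shown on this page):

void
dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
{
	if (missed)
		zs->zs_missed = missed;

	/*
	 * Other callers are still issuing their demand reads; the last
	 * one to arrive here performs the issuance for everyone.
	 */
	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0)
		return;

	/*
	 * All demand I/O is in flight: issue data prefetch for blocks
	 * [zs_pf_blkid1, zs_pf_blkid) and L1 indirect prefetch for
	 * [zs_ipf_blkid1, zs_ipf_blkid), dropping a zs_refs holder as
	 * each prefetch block completes.
	 */
}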

module/zfs/dmu.c (+29 -8)
@@ -497,10 +497,12 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
 	dmu_buf_t **dbp;
+	zstream_t *zs = NULL;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
-	zio_t *zio;
+	zio_t *zio = NULL;
+	boolean_t missed = B_FALSE;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
@@ -534,29 +536,48 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
+	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+		/*
+		 * Prepare the zfetch before initiating the demand reads, so
+		 * that if multiple threads block on same indirect block, we
+		 * base predictions on the original less racy request order.
+		 */
+		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
+		    read && DNODE_IS_CACHEABLE(dn), B_TRUE);
+	}
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
+			if (zs)
+				dmu_zfetch_run(zs, missed, B_TRUE);
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			zio_nowait(zio);
 			return (SET_ERROR(EIO));
 		}
 
-		/* initiate async i/o */
-		if (read)
+		/*
+		 * Initiate async demand data read.
+		 * We check the db_state after calling dbuf_read() because
+		 * (1) dbuf_read() may change the state to CACHED due to a
+		 * hit in the ARC, and (2) on a cache miss, a child will
+		 * have been added to "zio" but not yet completed, so the
+		 * state will not yet be CACHED.
+		 */
+		if (read) {
 			(void) dbuf_read(db, zio, dbuf_flags);
+			if (db->db_state != DB_CACHED)
+				missed = B_TRUE;
+		}
 		dbp[i] = &db->db;
 	}
 
 	if (!read)
 		zfs_racct_write(length, nblks);
 
-	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
-	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
-		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
-		    read && DNODE_IS_CACHEABLE(dn), B_TRUE);
-	}
+	if (zs)
+		dmu_zfetch_run(zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* wait for async i/o */
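Taken together, the hunk above is the pattern any consumer of the split API would follow. A hedged usage sketch (the function name and its simplifications are illustrative, not part of the commit):

static void
read_blocks_with_prefetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	boolean_t missed = B_FALSE;
	zstream_t *zs;

	/* 1. Predict early, before dbuf_hold() can reorder the threads. */
	zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
	    B_TRUE, B_TRUE);

	/* 2. Issue all on-demand reads, noting any ARC miss in "missed". */

	/* 3. Only then let the speculative I/O reach the disks. */
	if (zs != NULL)
		dmu_zfetch_run(zs, missed, B_TRUE);
}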
