aboutsummaryrefslogtreecommitdiff
path: root/sys/kern
diff options
context:
space:
mode:
authorMateusz Guzik <mjg@FreeBSD.org>2016-09-10 16:29:53 +0000
committerMateusz Guzik <mjg@FreeBSD.org>2016-09-10 16:29:53 +0000
commita27815330cdbe9cadfe2b4a3e761ed9093d0f374 (patch)
treed2601cb0c515a3d970b11d4e8832c534acf78d7a /sys/kern
parent20e45e033ce07f0cfca9605e295fb836011f8971 (diff)
downloadsrc-a27815330cdbe9cadfe2b4a3e761ed9093d0f374.tar.gz
src-a27815330cdbe9cadfe2b4a3e761ed9093d0f374.zip
cache: improve scalability by introducing bucket locks
An array of bucket locks is added. All modifications still require the global cache_lock to be held for writing. However, most readers only need the relevant bucket lock and in effect can run concurrently to the writer as long as they use a different lock. See the added comment for more details. This is an intermediate step towards removal of the global lock. Reviewed by: kib Tested by: pho
Notes
Notes: svn path=/head/; revision=305684
Diffstat (limited to 'sys/kern')
-rw-r--r--sys/kern/subr_witness.c8
-rw-r--r--sys/kern/vfs_cache.c268
2 files changed, 216 insertions, 60 deletions
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
index d4359efbd227..ed1203578c2c 100644
--- a/sys/kern/subr_witness.c
+++ b/sys/kern/subr_witness.c
@@ -623,6 +623,14 @@ static struct witness_order_list_entry order_lists[] = {
{ "vnode interlock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
+ * VFS namecache
+ */
+ { "ncglobal", &lock_class_rw },
+ { "ncbuc", &lock_class_rw },
+ { "vnode interlock", &lock_class_mtx_sleep },
+ { "ncneg", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
* ZFS locking
*/
{ "dn->dn_mtx", &lock_class_sx },
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 499b80c640c4..40623eab48c3 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
+#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
@@ -148,6 +149,23 @@ struct namecache_ts {
* Upon reaching the last segment of a path, if the reference
* is for DELETE, or NOCACHE is set (rewrite), and the
* name is located in the cache, it will be dropped.
+ *
+ * These locks are used (in the order in which they can be taken):
+ * NAME TYPE ROLE
+ * cache_lock rwlock global, needed for all modifications
+ * bucketlock rwlock for access to given hash bucket
+ * ncneg_mtx mtx negative entry LRU management
+ *
+ * A name -> vnode lookup can be safely performed by either locking cache_lock
+ * or the relevant hash bucket.
+ *
+ * ".." and vnode -> name lookups require cache_lock.
+ *
+ * Modifications require both cache_lock and relevant bucketlock taken for
+ * writing.
+ *
+ * Negative entry LRU management requires ncneg_mtx taken on top of either
+ * cache_lock or bucketlock.
*/
/*
@@ -179,8 +197,9 @@ SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
struct nchstats nchstats; /* cache effectiveness statistics */
static struct rwlock cache_lock;
-RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+RW_SYSINIT(vfscache, &cache_lock, "ncglobal");
+#define CACHE_TRY_WLOCK() rw_try_wlock(&cache_lock)
#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock)
#define CACHE_RLOCK() rw_rlock(&cache_lock)
#define CACHE_RUNLOCK() rw_runlock(&cache_lock)
@@ -188,7 +207,12 @@ RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
#define CACHE_WUNLOCK() rw_wunlock(&cache_lock)
static struct mtx_padalign ncneg_mtx;
-MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "Name Cache neg", MTX_DEF);
+MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);
+
+static u_int numbucketlocks;
+static struct rwlock_padalign *bucketlocks;
+#define HASH2BUCKETLOCK(hash) \
+ ((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
/*
* UMA zones for the VFS cache.
@@ -307,6 +331,8 @@ STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
static long numupgrades; STATNODE_ULONG(numupgrades,
"Number of updates of the cache after lookup (write lock + retry)");
+static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
+ "Number of times bucketlocked zap_and_exit case failed to writelock");
static void cache_zap(struct namecache *ncp);
static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
@@ -326,6 +352,39 @@ cache_get_hash(char *name, u_char len, struct vnode *dvp)
return (hash);
}
+#ifdef INVARIANTS
+static void
+cache_assert_bucket_locked(struct namecache *ncp, int mode)
+{
+ struct rwlock *bucketlock;
+ uint32_t hash;
+
+ hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+ bucketlock = HASH2BUCKETLOCK(hash);
+ rw_assert(bucketlock, mode);
+}
+#else
+#define cache_assert_bucket_locked(x, y) do { } while (0)
+#endif
+
+static void
+cache_lock_all_buckets(void)
+{
+ u_int i;
+
+ for (i = 0; i < numbucketlocks; i++)
+ rw_wlock(&bucketlocks[i]);
+}
+
+static void
+cache_unlock_all_buckets(void)
+{
+ u_int i;
+
+ for (i = 0; i < numbucketlocks; i++)
+ rw_wunlock(&bucketlocks[i]);
+}
+
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
@@ -442,21 +501,13 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
* Negative entries management
*/
static void
-cache_negative_hit(struct namecache *ncp, int wlocked)
+cache_negative_hit(struct namecache *ncp)
{
- if (!wlocked) {
- rw_assert(&cache_lock, RA_RLOCKED);
- mtx_lock(&ncneg_mtx);
- } else {
- rw_assert(&cache_lock, RA_WLOCKED);
- }
-
+ mtx_lock(&ncneg_mtx);
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-
- if (!wlocked)
- mtx_unlock(&ncneg_mtx);
+ mtx_unlock(&ncneg_mtx);
}
static void
@@ -464,9 +515,12 @@ cache_negative_insert(struct namecache *ncp)
{
rw_assert(&cache_lock, RA_WLOCKED);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
MPASS(ncp->nc_vp == NULL);
+ mtx_lock(&ncneg_mtx);
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
numneg++;
+ mtx_unlock(&ncneg_mtx);
}
static void
@@ -474,9 +528,12 @@ cache_negative_remove(struct namecache *ncp)
{
rw_assert(&cache_lock, RA_WLOCKED);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
MPASS(ncp->nc_vp == NULL);
+ mtx_lock(&ncneg_mtx);
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
numneg--;
+ mtx_unlock(&ncneg_mtx);
}
static struct namecache *
@@ -499,10 +556,11 @@ cache_negative_zap_one(void)
* pointer to a vnode or if it is just a negative cache entry.
*/
static void
-cache_zap(struct namecache *ncp)
+cache_zap_locked(struct namecache *ncp)
{
rw_assert(&cache_lock, RA_WLOCKED);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
if (ncp->nc_vp != NULL) {
SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
@@ -532,6 +590,21 @@ cache_zap(struct namecache *ncp)
numcache--;
}
+static void
+cache_zap(struct namecache *ncp)
+{
+ struct rwlock *bucketlock;
+ uint32_t hash;
+
+ rw_assert(&cache_lock, RA_WLOCKED);
+
+ hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+ bucketlock = HASH2BUCKETLOCK(hash);
+ rw_wlock(bucketlock);
+ cache_zap_locked(ncp);
+ rw_wunlock(bucketlock);
+}
+
/*
* Lookup an entry in the cache
*
@@ -549,22 +622,42 @@ cache_zap(struct namecache *ncp)
* not recursively acquired.
*/
+enum { UNLOCKED, WLOCKED, RLOCKED };
+
+static void
+cache_unlock(int cache_locked)
+{
+
+ switch (cache_locked) {
+ case UNLOCKED:
+ break;
+ case WLOCKED:
+ CACHE_WUNLOCK();
+ break;
+ case RLOCKED:
+ CACHE_RUNLOCK();
+ break;
+ }
+}
+
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
struct timespec *tsp, int *ticksp)
{
+ struct rwlock *bucketlock;
struct namecache *ncp;
uint32_t hash;
- int error, ltype, wlocked;
+ int error, ltype, cache_locked;
if (!doingcache) {
cnp->cn_flags &= ~MAKEENTRY;
return (0);
}
retry:
- wlocked = 0;
- counter_u64_add(numcalls, 1);
+ bucketlock = NULL;
+ cache_locked = UNLOCKED;
error = 0;
+ counter_u64_add(numcalls, 1);
retry_wlocked:
if (cnp->cn_nameptr[0] == '.') {
@@ -598,17 +691,21 @@ retry_wlocked:
}
return (-1);
}
- if (!wlocked)
- CACHE_RLOCK();
if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
counter_u64_add(dotdothits, 1);
+ if (cache_locked == UNLOCKED) {
+ CACHE_RLOCK();
+ cache_locked = RLOCKED;
+ }
+
if (dvp->v_cache_dd == NULL) {
SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
"..", NULL);
goto unlock;
}
if ((cnp->cn_flags & MAKEENTRY) == 0) {
- if (!wlocked && !CACHE_UPGRADE_LOCK())
+ if (cache_locked != WLOCKED &&
+ !CACHE_UPGRADE_LOCK())
goto wlock;
ncp = NULL;
if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
@@ -639,10 +736,14 @@ retry_wlocked:
nc_dotdottime;
goto success;
}
- } else if (!wlocked)
- CACHE_RLOCK();
+ }
hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+ if (cache_locked == UNLOCKED) {
+ bucketlock = HASH2BUCKETLOCK(hash);
+ rw_rlock(bucketlock);
+ }
+
LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
counter_u64_add(numchecks, 1);
if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
@@ -665,12 +766,7 @@ retry_wlocked:
/* We don't want to have an entry, so dump it */
if ((cnp->cn_flags & MAKEENTRY) == 0) {
counter_u64_add(numposzaps, 1);
- if (!wlocked && !CACHE_UPGRADE_LOCK())
- goto wlock;
- cache_zap(ncp);
- CACHE_WUNLOCK();
- cache_free(ncp);
- return (0);
+ goto zap_and_exit;
}
/* We found a "positive" match, return the vnode */
@@ -689,25 +785,20 @@ negative_success:
/* We found a negative match, and want to create it, so purge */
if (cnp->cn_nameiop == CREATE) {
counter_u64_add(numnegzaps, 1);
- if (!wlocked && !CACHE_UPGRADE_LOCK())
- goto wlock;
- cache_zap(ncp);
- CACHE_WUNLOCK();
- cache_free(ncp);
- return (0);
+ goto zap_and_exit;
}
counter_u64_add(numneghits, 1);
- cache_negative_hit(ncp, wlocked);
+ cache_negative_hit(ncp);
if (ncp->nc_flag & NCF_WHITE)
cnp->cn_flags |= ISWHITEOUT;
SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
nc_get_name(ncp));
cache_out_ts(ncp, tsp, ticksp);
- if (wlocked)
- CACHE_WUNLOCK();
- else
- CACHE_RUNLOCK();
+ MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+ if (bucketlock != NULL)
+ rw_runlock(bucketlock);
+ cache_unlock(cache_locked);
return (ENOENT);
wlock:
@@ -716,9 +807,10 @@ wlock:
* a write lock and retry the operation.
*/
CACHE_RUNLOCK();
+wlock_unlocked:
CACHE_WLOCK();
numupgrades++;
- wlocked = 1;
+ cache_locked = WLOCKED;
goto retry_wlocked;
success:
@@ -733,10 +825,10 @@ success:
VOP_UNLOCK(dvp, 0);
}
vhold(*vpp);
- if (wlocked)
- CACHE_WUNLOCK();
- else
- CACHE_RUNLOCK();
+ MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+ if (bucketlock != NULL)
+ rw_runlock(bucketlock);
+ cache_unlock(cache_locked);
error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
if (cnp->cn_flags & ISDOTDOT) {
vn_lock(dvp, ltype | LK_RETRY);
@@ -758,10 +850,29 @@ success:
return (-1);
unlock:
- if (wlocked)
- CACHE_WUNLOCK();
- else
- CACHE_RUNLOCK();
+ MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+ if (bucketlock != NULL)
+ rw_runlock(bucketlock);
+ cache_unlock(cache_locked);
+ return (0);
+
+zap_and_exit:
+ if (bucketlock != NULL) {
+ rw_assert(&cache_lock, RA_UNLOCKED);
+ if (!CACHE_TRY_WLOCK()) {
+ rw_runlock(bucketlock);
+ bucketlock = NULL;
+ zap_and_exit_bucket_fail++;
+ goto wlock_unlocked;
+ }
+ cache_locked = WLOCKED;
+ rw_runlock(bucketlock);
+ bucketlock = NULL;
+ } else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ cache_zap(ncp);
+ CACHE_WUNLOCK();
+ cache_free(ncp);
return (0);
}
@@ -772,6 +883,7 @@ void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
struct timespec *tsp, struct timespec *dtsp)
{
+ struct rwlock *bucketlock;
struct namecache *ncp, *n2, *ndd, *nneg;
struct namecache_ts *n3;
struct nchashhead *ncpp;
@@ -924,11 +1036,6 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
}
}
- /*
- * Insert the new namecache entry into the appropriate chain
- * within the cache entries table.
- */
- LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
if (flag != NCF_ISDOTDOT) {
if (LIST_EMPTY(&dvp->v_cache_src)) {
vhold(dvp);
@@ -937,6 +1044,15 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
}
+ bucketlock = HASH2BUCKETLOCK(hash);
+ rw_wlock(bucketlock);
+
+ /*
+ * Insert the new namecache entry into the appropriate chain
+ * within the cache entries table.
+ */
+ LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
/*
* If the entry is "negative", we place it into the
* "negative" cache queue, otherwise, we place it into the
@@ -953,6 +1069,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
nc_get_name(ncp));
}
+ rw_wunlock(bucketlock);
if (numneg * ncnegfactor > numcache)
nneg = cache_negative_zap_one();
CACHE_WUNLOCK();
@@ -960,12 +1077,24 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
cache_free(nneg);
}
+static u_int
+cache_roundup_2(u_int val)
+{
+ u_int res;
+
+ for (res = 1; res <= val; res <<= 1)
+ continue;
+
+ return (res);
+}
+
/*
* Name cache initialization, from vfs_init() when we are booting
*/
static void
nchinit(void *dummy __unused)
{
+ u_int i;
TAILQ_INIT(&ncneg);
@@ -983,6 +1112,13 @@ nchinit(void *dummy __unused)
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+ numbucketlocks = cache_roundup_2(mp_ncpus * 16);
+ if (numbucketlocks > nchash)
+ numbucketlocks = nchash;
+ bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < numbucketlocks; i++)
+ rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK);
numcalls = counter_u64_alloc(M_WAITOK);
dothits = counter_u64_alloc(M_WAITOK);
@@ -1023,6 +1159,7 @@ cache_changesize(int newmaxvnodes)
* because to do so, they have to be removed from the hash table.
*/
CACHE_WLOCK();
+ cache_lock_all_buckets();
old_nchashtbl = nchashtbl;
old_nchash = nchash;
nchashtbl = new_nchashtbl;
@@ -1035,6 +1172,7 @@ cache_changesize(int newmaxvnodes)
LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
}
}
+ cache_unlock_all_buckets();
CACHE_WUNLOCK();
free(old_nchashtbl, M_VFSCACHE);
}
@@ -1108,20 +1246,30 @@ void
cache_purgevfs(struct mount *mp)
{
TAILQ_HEAD(, namecache) ncps;
- struct nchashhead *ncpp;
+ struct rwlock *bucketlock;
+ struct nchashhead *bucket;
struct namecache *ncp, *nnp;
+ u_long i, j, n_nchash;
/* Scan hash tables for applicable entries */
SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
TAILQ_INIT(&ncps);
CACHE_WLOCK();
- for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
- LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
- if (ncp->nc_dvp->v_mount != mp)
- continue;
- cache_zap(ncp);
- TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+ n_nchash = nchash + 1;
+ for (i = 0; i < numbucketlocks; i++) {
+ bucketlock = (struct rwlock *)&bucketlocks[i];
+ rw_wlock(bucketlock);
+ for (j = i; j < n_nchash; j += numbucketlocks) {
+ bucket = &nchashtbl[j];
+ LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
+ if (ncp->nc_dvp->v_mount != mp)
+ continue;
+ cache_zap_locked(ncp);
+ TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
+ }
}
+ rw_wunlock(bucketlock);
}
CACHE_WUNLOCK();
TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {