Skip to content

Commit 2e3fd24

Browse files
JanScheurichNitin Katiyar
authored andcommitted
ofproto-dpif: Improve dp_hash selection method for select groups
The current implementation of the "dp_hash" selection method suffers from two deficiences: 1. The hash mask and hence the number of dp_hash values is just large enough to cover the number of group buckets, but does not consider the case that buckets have different weights. 2. The xlate-time selection of best bucket from the masked dp_hash value often results in bucket load distributions that are quite different from the bucket weights because the number of available masked dp_hash values is too small (2-6 bits compared to 32 bits of a full hash in the default hash selection method). This commit provides a more accurate implementation of the dp_hash select group by applying the well known Webster method for distributing a small number of "seats" fairly over the weighted "parties" (see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method). The dp_hash mask is autmatically chosen large enough to provide good enough accuracy even with widely differing weights. This distribution happens at group modification time and the resulting table is stored with the group-dpif struct. At xlation time, we use the masked dp_hash values as index to look up the assigned bucket. If the bucket should not be live, we do a circular search over the mapping table until we find the first live bucket. As the buckets in the table are by construction in pseudo-random order with a frequency according to their weight, this method maintains correct distribution even if one or more buckets are non-live. Xlation is further simplified by storing some derived select group state at group construction in struct group-dpif in a form better suited for xlation purposes. Adapted the unit test case for dp_hash select group accordingly. Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Co-authored-by: Nitin Katiyar <nitin.katiyar@ericsson.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
1 parent 6a0b0d3 commit 2e3fd24

File tree

5 files changed

+211
-30
lines changed

5 files changed

+211
-30
lines changed

lib/odp-util.c

+3-1
Original file line numberDiff line numberDiff line change
@@ -595,7 +595,9 @@ format_odp_hash_action(struct ds *ds, const struct ovs_action_hash *hash_act)
595595
ds_put_format(ds, "hash(");
596596

597597
if (hash_act->hash_alg == OVS_HASH_ALG_L4) {
598-
ds_put_format(ds, "hash_l4(%"PRIu32")", hash_act->hash_basis);
598+
ds_put_format(ds, "l4(%"PRIu32")", hash_act->hash_basis);
599+
} else if (hash_act->hash_alg == OVS_HASH_ALG_SYM_L4) {
600+
ds_put_format(ds, "sym_l4(%"PRIu32")", hash_act->hash_basis);
599601
} else {
600602
ds_put_format(ds, "Unknown hash algorithm(%"PRIu32")",
601603
hash_act->hash_alg);

ofproto/ofproto-dpif-xlate.c

+37-22
Original file line numberDiff line numberDiff line change
@@ -4392,27 +4392,37 @@ pick_hash_fields_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
43924392
static struct ofputil_bucket *
43934393
pick_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
43944394
{
4395+
uint32_t dp_hash = ctx->xin->flow.dp_hash;
4396+
43954397
/* dp_hash value 0 is special since it means that the dp_hash has not been
43964398
* computed, as all computed dp_hash values are non-zero. Therefore
43974399
* compare to zero can be used to decide if the dp_hash value is valid
43984400
* without masking the dp_hash field. */
4399-
if (!ctx->xin->flow.dp_hash) {
4400-
uint64_t param = group->up.props.selection_method_param;
4401-
4402-
ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param);
4401+
if (!dp_hash) {
4402+
enum ovs_hash_alg hash_alg = group->hash_alg;
4403+
if (hash_alg > ctx->xbridge->support.max_hash_alg) {
4404+
/* Algorithm supported by all datapaths. */
4405+
hash_alg = OVS_HASH_ALG_L4;
4406+
}
4407+
ctx_trigger_recirculate_with_hash(ctx, hash_alg, group->hash_basis);
44034408
return NULL;
44044409
} else {
4405-
uint32_t n_buckets = group->up.n_buckets;
4406-
if (n_buckets) {
4407-
/* Minimal mask to cover the number of buckets. */
4408-
uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1;
4409-
/* Multiplier chosen to make the trivial 1 bit case to
4410-
* actually distribute amongst two equal weight buckets. */
4411-
uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask);
4412-
4413-
ctx->wc->masks.dp_hash |= mask;
4414-
return group_best_live_bucket(ctx, group, basis);
4410+
uint32_t hash_mask = group->hash_mask;
4411+
ctx->wc->masks.dp_hash |= hash_mask;
4412+
4413+
/* Starting from the original masked dp_hash value iterate over the
4414+
* hash mapping table to find the first live bucket. As the buckets
4415+
* are quasi-randomly spread over the hash values, this maintains
4416+
* a distribution according to bucket weights even when some buckets
4417+
* are non-live. */
4418+
for (int i = 0; i <= hash_mask; i++) {
4419+
struct ofputil_bucket *b =
4420+
group->hash_map[(dp_hash + i) & hash_mask];
4421+
if (bucket_is_alive(ctx, b, 0)) {
4422+
return b;
4423+
}
44154424
}
4425+
44164426
return NULL;
44174427
}
44184428
}
@@ -4427,17 +4437,22 @@ pick_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
44274437
ctx_trigger_freeze(ctx);
44284438
}
44294439

4430-
const char *selection_method = group->up.props.selection_method;
4431-
if (selection_method[0] == '\0') {
4440+
switch (group->selection_method) {
4441+
case SEL_METHOD_DEFAULT:
44324442
return pick_default_select_group(ctx, group);
4433-
} else if (!strcasecmp("hash", selection_method)) {
4443+
break;
4444+
case SEL_METHOD_HASH:
44344445
return pick_hash_fields_select_group(ctx, group);
4435-
} else if (!strcasecmp("dp_hash", selection_method)) {
4446+
break;
4447+
case SEL_METHOD_DP_HASH:
44364448
return pick_dp_hash_select_group(ctx, group);
4437-
} else {
4438-
/* Parsing of groups should ensure this never happens */
4449+
break;
4450+
default:
4451+
/* Parsing of groups ensures this never happens */
44394452
OVS_NOT_REACHED();
44404453
}
4454+
4455+
return NULL;
44414456
}
44424457

44434458
static void
@@ -4731,8 +4746,8 @@ finish_freezing__(struct xlate_ctx *ctx, uint8_t table)
47314746
act_hash = nl_msg_put_unspec_uninit(ctx->odp_actions,
47324747
OVS_ACTION_ATTR_HASH,
47334748
sizeof *act_hash);
4734-
act_hash->hash_alg = OVS_HASH_ALG_L4; /* Make configurable. */
4735-
act_hash->hash_basis = 0; /* Make configurable. */
4749+
act_hash->hash_alg = ctx->dp_hash_alg;
4750+
act_hash->hash_basis = ctx->dp_hash_basis;
47364751
}
47374752
nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, recirc_id);
47384753
}

ofproto/ofproto-dpif.c

+150
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "lacp.h"
3333
#include "learn.h"
3434
#include "mac-learning.h"
35+
#include "math.h"
3536
#include "mcast-snooping.h"
3637
#include "multipath.h"
3738
#include "netdev-vport.h"
@@ -4762,6 +4763,147 @@ group_dpif_credit_stats(struct group_dpif *group,
47624763
ovs_mutex_unlock(&group->stats_mutex);
47634764
}
47644765

4766+
/* Calculate the dp_hash mask needed to provide the least weighted bucket
4767+
* with at least one hash value and construct a mapping table from masked
4768+
* dp_hash value to group bucket using the Webster method.
4769+
* If the caller specifies a non-zero max_hash value, abort and return false
4770+
* if more hash values would be required. The absolute maximum number of
4771+
* hash values supported is 256. */
4772+
4773+
#define MAX_SELECT_GROUP_HASH_VALUES 256
4774+
4775+
static bool
4776+
group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
4777+
{
4778+
struct ofputil_bucket *bucket;
4779+
uint32_t n_buckets = group->up.n_buckets;
4780+
uint64_t total_weight = 0;
4781+
uint16_t min_weight = UINT16_MAX;
4782+
struct webster {
4783+
struct ofputil_bucket *bucket;
4784+
uint32_t divisor;
4785+
double value;
4786+
int hits;
4787+
} *webster;
4788+
4789+
if (n_buckets == 0) {
4790+
VLOG_DBG(" Don't apply dp_hash method without buckets");
4791+
return false;
4792+
}
4793+
4794+
webster = xcalloc(n_buckets, sizeof(struct webster));
4795+
int i = 0;
4796+
LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
4797+
if (bucket->weight > 0 && bucket->weight < min_weight) {
4798+
min_weight = bucket->weight;
4799+
}
4800+
total_weight += bucket->weight;
4801+
webster[i].bucket = bucket;
4802+
webster[i].divisor = 1;
4803+
webster[i].value = bucket->weight;
4804+
webster[i].hits = 0;
4805+
i++;
4806+
}
4807+
4808+
if (total_weight == 0) {
4809+
VLOG_DBG(" Total weight is zero. No active buckets.");
4810+
free(webster);
4811+
return false;
4812+
}
4813+
VLOG_DBG(" Minimum weight: %d, total weight: %"PRIu64,
4814+
min_weight, total_weight);
4815+
4816+
uint64_t min_slots = DIV_ROUND_UP(total_weight, min_weight);
4817+
uint64_t min_slots2 = ROUND_UP_POW2(min_slots);
4818+
uint64_t n_hash = MAX(16, min_slots2);
4819+
if (n_hash > MAX_SELECT_GROUP_HASH_VALUES ||
4820+
(max_hash != 0 && n_hash > max_hash)) {
4821+
VLOG_DBG(" Too many hash values required: %"PRIu64, n_hash);
4822+
return false;
4823+
}
4824+
4825+
VLOG_DBG(" Using %"PRIu64" hash values:", n_hash);
4826+
group->hash_mask = n_hash - 1;
4827+
if (group->hash_map) {
4828+
free(group->hash_map);
4829+
}
4830+
group->hash_map = xcalloc(n_hash, sizeof(struct ofputil_bucket *));
4831+
4832+
/* Use Webster method to distribute hash values over buckets. */
4833+
for (int hash = 0; hash < n_hash; hash++) {
4834+
struct webster *winner = &webster[0];
4835+
for (i = 1; i < n_buckets; i++) {
4836+
if (webster[i].value > winner->value) {
4837+
winner = &webster[i];
4838+
}
4839+
}
4840+
winner->hits++;
4841+
winner->divisor += 2;
4842+
winner->value = (double) winner->bucket->weight / winner->divisor;
4843+
group->hash_map[hash] = winner->bucket;
4844+
}
4845+
4846+
i = 0;
4847+
LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
4848+
double target = (n_hash * bucket->weight) / (double) total_weight;
4849+
VLOG_DBG(" Bucket %d: weight=%d, target=%.2f hits=%d",
4850+
bucket->bucket_id, bucket->weight,
4851+
target, webster[i].hits);
4852+
i++;
4853+
}
4854+
4855+
free(webster);
4856+
return true;
4857+
}
4858+
4859+
static void
4860+
group_set_selection_method(struct group_dpif *group)
4861+
{
4862+
const struct ofputil_group_props *props = &group->up.props;
4863+
const char *selection_method = props->selection_method;
4864+
4865+
if (selection_method[0] == '\0') {
4866+
VLOG_DBG("No selection method specified.");
4867+
group->selection_method = SEL_METHOD_DEFAULT;
4868+
} else if (!strcmp(selection_method, "dp_hash")) {
4869+
VLOG_DBG("Selection method specified: dp_hash.");
4870+
/* Try to use dp_hash if possible at all. */
4871+
if (group_setup_dp_hash_table(group, 0)) {
4872+
group->selection_method = SEL_METHOD_DP_HASH;
4873+
group->hash_alg = props->selection_method_param >> 32;
4874+
if (group->hash_alg >= __OVS_HASH_MAX) {
4875+
VLOG_DBG(" Invalid dp_hash algorithm %d. "
4876+
"Defaulting to OVS_HASH_ALG_L4", group->hash_alg);
4877+
group->hash_alg = OVS_HASH_ALG_L4;
4878+
}
4879+
group->hash_basis = (uint32_t) props->selection_method_param;
4880+
VLOG_DBG("Use dp_hash with %d hash values using algorithm %d.",
4881+
group->hash_mask + 1, group->hash_alg);
4882+
} else {
4883+
/* Fall back to original default hashing in slow path. */
4884+
VLOG_DBG(" Falling back to default hash method.");
4885+
group->selection_method = SEL_METHOD_DEFAULT;
4886+
}
4887+
} else if (!strcmp(selection_method, "hash")) {
4888+
VLOG_DBG("Selection method specified: hash.");
4889+
if (props->fields.values_size > 0) {
4890+
/* Controller has specified hash fields. */
4891+
struct ds s = DS_EMPTY_INITIALIZER;
4892+
oxm_format_field_array(&s, &props->fields);
4893+
VLOG_DBG(" Hash fields: %s", ds_cstr(&s));
4894+
ds_destroy(&s);
4895+
group->selection_method = SEL_METHOD_HASH;
4896+
} else {
4897+
/* No hash fields. Fall back to original default hashing. */
4898+
VLOG_DBG(" No hash fields. Falling back to default hash method.");
4899+
group->selection_method = SEL_METHOD_DEFAULT;
4900+
}
4901+
} else {
4902+
/* Parsing of groups should ensure this never happens */
4903+
OVS_NOT_REACHED();
4904+
}
4905+
}
4906+
47654907
static enum ofperr
47664908
group_construct(struct ofgroup *group_)
47674909
{
@@ -4770,6 +4912,10 @@ group_construct(struct ofgroup *group_)
47704912
ovs_mutex_init_adaptive(&group->stats_mutex);
47714913
ovs_mutex_lock(&group->stats_mutex);
47724914
group_construct_stats(group);
4915+
group->hash_map = NULL;
4916+
if (group->up.type == OFPGT11_SELECT) {
4917+
group_set_selection_method(group);
4918+
}
47734919
ovs_mutex_unlock(&group->stats_mutex);
47744920
return 0;
47754921
}
@@ -4779,6 +4925,10 @@ group_destruct(struct ofgroup *group_)
47794925
{
47804926
struct group_dpif *group = group_dpif_cast(group_);
47814927
ovs_mutex_destroy(&group->stats_mutex);
4928+
if (group->hash_map) {
4929+
free(group->hash_map);
4930+
group->hash_map = NULL;
4931+
}
47824932
}
47834933

47844934
static enum ofperr

ofproto/ofproto-dpif.h

+13
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,12 @@ rule_dpif_is_internal(const struct rule_dpif *rule)
119119

120120
/* Groups. */
121121

122+
enum group_selection_method {
123+
SEL_METHOD_DEFAULT,
124+
SEL_METHOD_DP_HASH,
125+
SEL_METHOD_HASH,
126+
};
127+
122128
struct group_dpif {
123129
struct ofgroup up;
124130

@@ -129,6 +135,12 @@ struct group_dpif {
129135
struct ovs_mutex stats_mutex;
130136
uint64_t packet_count OVS_GUARDED; /* Number of packets received. */
131137
uint64_t byte_count OVS_GUARDED; /* Number of bytes received. */
138+
139+
enum group_selection_method selection_method;
140+
enum ovs_hash_alg hash_alg; /* dp_hash algorithm to be applied. */
141+
uint32_t hash_basis; /* Basis for dp_hash. */
142+
uint32_t hash_mask; /* Used to mask dp_hash (2^N - 1).*/
143+
struct ofputil_bucket **hash_map; /* Map hash values to buckets. */
132144
};
133145

134146
void group_dpif_credit_stats(struct group_dpif *,
@@ -137,6 +149,7 @@ void group_dpif_credit_stats(struct group_dpif *,
137149
struct group_dpif *group_dpif_lookup(struct ofproto_dpif *,
138150
uint32_t group_id, ovs_version_t version,
139151
bool take_ref);
152+
140153

141154
/* Backers.
142155
*

tests/ofproto-dpif.at

+8-7
Original file line numberDiff line numberDiff line change
@@ -500,11 +500,12 @@ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
500500
AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
501501
done
502502

503-
AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl
503+
AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/packets.*actions:1/actions:1/' | \
504+
strip_ufid | strip_used | sort | uniq], [0], [dnl
504505
flow-dump from non-dpdk interfaces:
505-
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1)
506-
recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
507-
recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11
506+
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(l4(0)),recirc(0x1)
507+
recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
508+
recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11
508509
])
509510

510511
AT_CHECK([ovs-appctl revalidator/purge], [0])
@@ -516,10 +517,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
516517
AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
517518
done
518519

519-
AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/' | strip_ufid | strip_used | sort], [0], [dnl
520+
AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/\(actions:1\)[[01]]/\1X/' | strip_ufid | strip_used | sort], [0], [dnl
520521
flow-dump from non-dpdk interfaces:
521-
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2)
522-
recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:1590, used:0.0s, actions:1X
522+
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(l4(0)),recirc(0x2)
523+
recirc_id(0x2),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:1590, used:0.0s, actions:1X
523524
])
524525

525526
OVS_VSWITCHD_STOP

0 commit comments

Comments
 (0)