diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c1637515a8a41..acd7de85608b1 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -83,6 +83,7 @@ struct sk_psock { u32 cork_bytes; u32 eval; bool redir_ingress; /* undefined if sk_redir is null */ + bool redir_permanent; struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 095ca7238ac20..ba43b35581bae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3044,11 +3044,23 @@ union bpf_attr { * socket level. If the message *msg* is allowed to pass (i.e. if * the verdict eBPF program returns **SK_PASS**), redirect it to * the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress path otherwise). This is the only flag supported for now. + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. + * + * The following *flags* are supported: + * + * **BPF_F_INGRESS** + * Both ingress and egress interfaces can be used for redirection. + * The **BPF_F_INGRESS** value in *flags* is used to make the + * distinction. Ingress path is selected if the flag is present, + * egress path otherwise. + * **BPF_F_PERMANENT** + * Indicates that redirect verdict and the target socket should be + * remembered. The verdict program will not be run for subsequent + * packets. + * + * **BPF_F_PERMANENT** cannot be used together with + * **bpf_msg_apply_bytes**\ () and **bpf_msg_cork_bytes**\ (). If + * **BPF_F_PERMANENT** is set, apply_bytes and cork_bytes are ignored. * Return * **SK_PASS** on success, or **SK_DROP** on error. * @@ -3321,11 +3333,23 @@ union bpf_attr { * socket level. If the message *msg* is allowed to pass (i.e. 
if * the verdict eBPF program returns **SK_PASS**), redirect it to * the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress path otherwise). This is the only flag supported for now. + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. + * + * The following *flags* are supported: + * + * **BPF_F_INGRESS** + * Both ingress and egress interfaces can be used for redirection. + * The **BPF_F_INGRESS** value in *flags* is used to make the + * distinction. Ingress path is selected if the flag is present, + * egress path otherwise. + * **BPF_F_PERMANENT** + * Indicates that redirect verdict and the target socket should be + * remembered. The verdict program will not be run for subsequent + * packets. + * + * **BPF_F_PERMANENT** cannot be used together with + * **bpf_msg_apply_bytes**\ () and **bpf_msg_cork_bytes**\ (). If + * **BPF_F_PERMANENT** is set, apply_bytes and cork_bytes are ignored. * Return * **SK_PASS** on success, or **SK_DROP** on error. * @@ -5928,6 +5952,7 @@ enum { /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ enum { BPF_F_INGRESS = (1ULL << 0), + BPF_F_PERMANENT = (1ULL << 1), }; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. 
*/ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 6c31eefbd7778..22e55eec30890 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -878,7 +878,11 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, msg->sk = sk; ret = bpf_prog_run_pin_on_cpu(prog, msg); ret = sk_psock_map_verd(ret, msg->sk_redir); - psock->apply_bytes = msg->apply_bytes; + psock->redir_permanent = msg->flags & BPF_F_PERMANENT; + if (psock->redir_permanent) + msg->cork_bytes = msg->apply_bytes = 0; + else + psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { if (psock->sk_redir) { sock_put(psock->sk_redir); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4292c2ed18286..b716b42486c5a 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -662,7 +662,7 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, { struct sock *sk; - if (unlikely(flags & ~(BPF_F_INGRESS))) + if (unlikely(flags & ~(BPF_F_INGRESS | BPF_F_PERMANENT))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); @@ -1263,7 +1263,7 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, { struct sock *sk; - if (unlikely(flags & ~(BPF_F_INGRESS))) + if (unlikely(flags & ~(BPF_F_INGRESS | BPF_F_PERMANENT))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 53b0d62fd2c2d..3b3719a37a264 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -433,8 +433,10 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, if (!psock->apply_bytes) { /* Clean up before releasing the sock lock. 
*/ eval = psock->eval; - psock->eval = __SK_NONE; - psock->sk_redir = NULL; + if (!psock->redir_permanent) { + psock->eval = __SK_NONE; + psock->sk_redir = NULL; + } } if (psock->cork) { cork = true; @@ -448,7 +450,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, msg, tosend, flags); sent = origsize - msg->sg.size; - if (eval == __SK_REDIRECT) + if (!psock->redir_permanent && eval == __SK_REDIRECT) sock_put(sk_redir); lock_sock(sk); @@ -474,8 +476,8 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, } if (likely(!ret)) { - if (!psock->apply_bytes) { - psock->eval = __SK_NONE; + if (!psock->apply_bytes && !psock->redir_permanent) { + psock->eval = __SK_NONE; if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 095ca7238ac20..ba43b35581bae 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3044,11 +3044,23 @@ union bpf_attr { * socket level. If the message *msg* is allowed to pass (i.e. if * the verdict eBPF program returns **SK_PASS**), redirect it to * the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress path otherwise). This is the only flag supported for now. + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. + * + * The following *flags* are supported: + * + * **BPF_F_INGRESS** + * Both ingress and egress interfaces can be used for redirection. + * The **BPF_F_INGRESS** value in *flags* is used to make the + * distinction. Ingress path is selected if the flag is present, + * egress path otherwise. + * **BPF_F_PERMANENT** + * Indicates that redirect verdict and the target socket should be + * remembered. 
The verdict program will not be run for subsequent + * packets. + * + * **BPF_F_PERMANENT** cannot be used together with + * **bpf_msg_apply_bytes**\ () and **bpf_msg_cork_bytes**\ (). If + * **BPF_F_PERMANENT** is set, apply_bytes and cork_bytes are ignored. * Return * **SK_PASS** on success, or **SK_DROP** on error. * @@ -3321,11 +3333,23 @@ union bpf_attr { * socket level. If the message *msg* is allowed to pass (i.e. if * the verdict eBPF program returns **SK_PASS**), redirect it to * the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress path otherwise). This is the only flag supported for now. + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. + * + * The following *flags* are supported: + * + * **BPF_F_INGRESS** + * Both ingress and egress interfaces can be used for redirection. + * The **BPF_F_INGRESS** value in *flags* is used to make the + * distinction. Ingress path is selected if the flag is present, + * egress path otherwise. + * **BPF_F_PERMANENT** + * Indicates that redirect verdict and the target socket should be + * remembered. The verdict program will not be run for subsequent + * packets. + * + * **BPF_F_PERMANENT** cannot be used together with + * **bpf_msg_apply_bytes**\ () and **bpf_msg_cork_bytes**\ (). If + * **BPF_F_PERMANENT** is set, apply_bytes and cork_bytes are ignored. * Return * **SK_PASS** on success, or **SK_DROP** on error. * @@ -5928,6 +5952,7 @@ enum { /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ enum { BPF_F_INGRESS = (1ULL << 0), + BPF_F_PERMANENT = (1ULL << 1), }; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. 
*/ diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index f75f84d0b3d79..4d49129cdd6b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -12,6 +12,7 @@ #include "test_sockmap_progs_query.skel.h" #include "test_sockmap_pass_prog.skel.h" #include "test_sockmap_drop_prog.skel.h" +#include "test_sockmap_msg_verdict.skel.h" #include "bpf_iter_sockmap.skel.h" #include "sockmap_helpers.h" @@ -524,6 +525,107 @@ static void test_sockmap_skb_verdict_peek(void) test_sockmap_pass_prog__destroy(pass); } +static void test_sockmap_msg_verdict(bool is_ingress, bool is_permanent, bool is_self, + bool target_shutdown) +{ + int key, sent, recvd, recv_fd, target_fd; + int err, map, verdict, s, c0, c1, p0, p1; + struct test_sockmap_msg_verdict *skel; + char buf[256] = "0123456789"; + + skel = test_sockmap_msg_verdict__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + verdict = bpf_program__fd(skel->progs.prog_skmsg_verdict); + map = bpf_map__fd(skel->maps.sock_map); + + + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + s = socket_loopback(AF_INET, SOCK_STREAM); + if (!ASSERT_GT(s, -1, "socket_loopback(s)")) + goto out; + err = create_socket_pairs(s, AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); + if (!ASSERT_OK(err, "create_socket_pairs(s)")) + goto out; + + key = 0; + err = bpf_map_update_elem(map, &key, &p1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(key0)")) + goto out_close; + key = 1; + err = bpf_map_update_elem(map, &key, &c1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(key1)")) + goto out_close; + key = 2; + err = bpf_map_update_elem(map, &key, &p0, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(key2)")) + goto out_close; + key = 3; + err = bpf_map_update_elem(map, &key, &c0, BPF_NOEXIST); + if 
(!ASSERT_OK(err, "bpf_map_update_elem(key3)")) + goto out_close; + + if (is_ingress) { + skel->bss->skmsg_redir_flags = BPF_F_INGRESS; + if (is_self) { + skel->bss->skmsg_redir_key = 0; + target_fd = p1; + recv_fd = p1; + } else { + skel->bss->skmsg_redir_key = 1; + target_fd = c1; + recv_fd = c1; + } + } else { + skel->bss->skmsg_redir_flags = 0; + if (is_self) { + skel->bss->skmsg_redir_key = 0; + target_fd = p1; + recv_fd = c1; + } else { + skel->bss->skmsg_redir_key = 2; + target_fd = p0; + recv_fd = c0; + } + } + + if (is_permanent) + skel->bss->skmsg_redir_flags |= BPF_F_PERMANENT; + + sent = xsend(p1, &buf, sizeof(buf), 0); + ASSERT_EQ(sent, sizeof(buf), "xsend(p1)"); + recvd = recv_timeout(recv_fd, &buf, sizeof(buf), SOCK_NONBLOCK, IO_TIMEOUT_SEC); + ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(recv_fd)"); + + if (target_shutdown) { + signal(SIGPIPE, SIG_IGN); + close(target_fd); + sent = send(p1, &buf, sizeof(buf), 0); + if (is_permanent) { + ASSERT_EQ(sent, -1, "xsend(p1)"); + ASSERT_EQ(errno, EPIPE, "xsend(p1)"); + } else { + ASSERT_EQ(sent, sizeof(buf), "xsend(p1)"); + } + goto out_close; + } + + sent = xsend(p1, &buf, sizeof(buf), 0); + ASSERT_EQ(sent, sizeof(buf), "xsend(p1)"); + recvd = recv_timeout(recv_fd, &buf, sizeof(buf), SOCK_NONBLOCK, IO_TIMEOUT_SEC); + ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(recv_fd)"); +out_close: + close(c0); + close(p0); + close(c1); + close(p1); +out: + test_sockmap_msg_verdict__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -566,4 +668,24 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); + if (test__start_subtest("sockmap msg_verdict")) + test_sockmap_msg_verdict(false, false, false, false); + if (test__start_subtest("sockmap msg_verdict ingress")) + test_sockmap_msg_verdict(true, false, false, false); + if (test__start_subtest("sockmap 
msg_verdict permanent")) + test_sockmap_msg_verdict(false, true, false, false); + if (test__start_subtest("sockmap msg_verdict ingress permanent")) + test_sockmap_msg_verdict(true, true, false, false); + if (test__start_subtest("sockmap msg_verdict permanent self")) + test_sockmap_msg_verdict(false, true, true, false); + if (test__start_subtest("sockmap msg_verdict ingress permanent self")) + test_sockmap_msg_verdict(true, true, true, false); + if (test__start_subtest("sockmap msg_verdict permanent shutdown")) + test_sockmap_msg_verdict(false, true, false, true); + if (test__start_subtest("sockmap msg_verdict ingress permanent shutdown")) + test_sockmap_msg_verdict(true, true, false, true); + if (test__start_subtest("sockmap msg_verdict shutdown")) + test_sockmap_msg_verdict(false, false, false, true); + if (test__start_subtest("sockmap msg_verdict ingress shutdown")) + test_sockmap_msg_verdict(true, false, false, true); } diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 99d2ea9fb658f..b0a2ddd55b834 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -298,8 +298,9 @@ int bpf_prog6(struct sk_msg_md *msg) f = bpf_map_lookup_elem(&sock_redir_flags, &zero); if (f && *f) { - key = 2; flags = *f; + if (flags & BPF_F_INGRESS) + key = 2; } #ifdef SOCKMAP return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_msg_verdict.c b/tools/testing/selftests/bpf/progs/test_sockmap_msg_verdict.c new file mode 100644 index 0000000000000..002b76a1ae35c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_msg_verdict.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 4); + __type(key, int); + __type(value, int); +} sock_map SEC(".maps"); + +u64 
skmsg_redir_flags = 0; +u32 skmsg_redir_key = 0; + +SEC("sk_msg") +int prog_skmsg_verdict(struct sk_msg_md *msg) +{ + u64 flags = skmsg_redir_flags; + int key = skmsg_redir_key; + + bpf_msg_redirect_map(msg, &sock_map, key, flags); + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 024a0faafb3be..c602ac8780a82 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -77,6 +77,7 @@ int txmsg_end_push; int txmsg_start_pop; int txmsg_pop; int txmsg_ingress; +int txmsg_permanent; int txmsg_redir_skb; int txmsg_ktls_skb; int txmsg_ktls_skb_drop; @@ -107,6 +108,7 @@ static const struct option long_options[] = { {"txmsg_start_pop", required_argument, NULL, 'w'}, {"txmsg_pop", required_argument, NULL, 'x'}, {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, + {"txmsg_permanent", no_argument, &txmsg_permanent, 1 }, {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, @@ -175,7 +177,7 @@ static void test_reset(void) txmsg_start_push = txmsg_end_push = 0; txmsg_pass = txmsg_drop = txmsg_redir = 0; txmsg_apply = txmsg_cork = 0; - txmsg_ingress = txmsg_redir_skb = 0; + txmsg_ingress = txmsg_permanent = txmsg_redir_skb = 0; txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; txmsg_omit_skb_parser = 0; skb_use_parser = 0; @@ -1164,11 +1166,27 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) } + if (txmsg_permanent) { + int txmsg_flag = BPF_F_PERMANENT; + + i = 0; + err = bpf_map_update_elem(map_fd[6], &i, &txmsg_flag, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_permanent): %d (%s)\n", + err, strerror(errno)); + goto out; + } + } + if (txmsg_ingress) { - int in = BPF_F_INGRESS; + int txmsg_flag = BPF_F_INGRESS; + + if (txmsg_permanent) + txmsg_flag |= 
BPF_F_PERMANENT; i = 0; - err = bpf_map_update_elem(map_fd[6], &i, &in, BPF_ANY); + err = bpf_map_update_elem(map_fd[6], &i, &txmsg_flag, BPF_ANY); if (err) { fprintf(stderr, "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", @@ -1485,6 +1503,13 @@ static void test_txmsg_redir(int cgrp, struct sockmap_options *opt) test_send(opt, cgrp); } +static void test_txmsg_redir_permanent(int cgrp, struct sockmap_options *opt) +{ + txmsg_redir = 1; + txmsg_permanent = 1; + test_send(opt, cgrp); +} + static void test_txmsg_redir_wait_sndmem(int cgrp, struct sockmap_options *opt) { txmsg_redir = 1; @@ -1506,6 +1531,14 @@ static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) test_send(opt, cgrp); } +static void test_txmsg_ingress_redir_permanent(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 1; + txmsg_permanent = 1; + test_send(opt, cgrp); +} + static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) { bool data = opt->data_test; @@ -1859,9 +1892,11 @@ static int populate_progs(char *bpf_file) struct _test test[] = { {"txmsg test passthrough", test_txmsg_pass}, {"txmsg test redirect", test_txmsg_redir}, + {"txmsg test redirect permanent", test_txmsg_redir_permanent}, {"txmsg test redirect wait send mem", test_txmsg_redir_wait_sndmem}, {"txmsg test drop", test_txmsg_drop}, {"txmsg test ingress redirect", test_txmsg_ingress_redir}, + {"txmsg test ingress redirect permanent", test_txmsg_ingress_redir_permanent}, {"txmsg test skb", test_txmsg_skb}, {"txmsg test apply", test_txmsg_apply}, {"txmsg test cork", test_txmsg_cork},