Skip to content

Commit

Permalink
explicit multipathing in v2b
Browse files Browse the repository at this point in the history
  • Loading branch information
rcgoodfellow committed Jan 4, 2024
1 parent 82ea7ca commit 8c6326c
Show file tree
Hide file tree
Showing 9 changed files with 91 additions and 86 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ version_check = "0.9"
zerocopy = { version = "0.7", features = ["derive"] }
zone = { git = "https://github.com/oxidecomputer/zone" }
ztest = { git = "https://github.com/oxidecomputer/falcon", branch = "main" }
poptrie = { git = "https://github.com/oxidecomputer/poptrie" }
poptrie = { git = "https://github.com/oxidecomputer/poptrie", branch = "multipath" }

[profile.release]
debug = 2
10 changes: 5 additions & 5 deletions bin/opteadm/src/bin/opteadm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ enum Command {
SetV2P { vpc_ip: IpAddr, vpc_mac: MacAddr, underlay_ip: Ipv6Addr, vni: Vni },

/// Set a virtual-to-boundary mapping
SetV2B { prefix: IpCidr, tunnel_endpoint: Ipv6Addr },
SetV2B { prefix: IpCidr, tunnel_endpoint: Vec<Ipv6Addr> },

/// Clear a virtual-to-boundary mapping
ClearV2B { vpc_ip: IpCidr },
Expand Down Expand Up @@ -623,10 +623,10 @@ fn main() -> anyhow::Result<()> {

Command::SetV2B { prefix, tunnel_endpoint } => {
let hdl = opteadm::OpteAdm::open(OpteAdm::XDE_CTL)?;
let tep = TunnelEndpoint {
ip: tunnel_endpoint,
vni: Vni::new(99u32).unwrap(),
};
let tep = tunnel_endpoint
.into_iter()
.map(|ip| TunnelEndpoint { ip, vni: Vni::new(99u32).unwrap() })
.collect();
let req = SetVirt2BoundaryReq { vip: prefix, tep };
hdl.set_v2b(&req)?;
}
Expand Down
31 changes: 31 additions & 0 deletions lib/opte/src/engine/packet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use core::fmt::Display;
use core::ptr;
use core::result;
use core::slice;
use crc32fast::Hasher;
use dyn_clone::DynClone;
use serde::Deserialize;
use serde::Serialize;
Expand Down Expand Up @@ -325,6 +326,36 @@ impl PacketMeta {
_ => None,
}
}

/// Compute a CRC32 hash over the inner flow's L3/L4 identifying fields.
///
/// The hasher is fed, in order: the inner IP source address, the inner
/// IP destination address, the IP protocol number (one byte), and the
/// upper-layer source and destination ports in big-endian byte order.
/// ICMPv4 and ICMPv6 currently contribute `0` for both ports.
///
/// Returns `None` when the packet has no inner ULP metadata or no
/// inner IP metadata.
pub fn l4_hash(&self) -> Option<u32> {
    // Both pieces of metadata are required; bail out before hashing.
    let ulp = self.inner.ulp?;
    let ip = self.inner.ip.as_ref()?;

    let mut hasher = Hasher::new();

    // The v4 and v6 metadata are distinct types, so each gets its own arm
    // even though the hashed fields are the same.
    match ip {
        IpMeta::Ip4(v4) => {
            hasher.update(&v4.src.bytes());
            hasher.update(&v4.dst.bytes());
            hasher.update(&[u8::from(v4.proto)]);
        }
        IpMeta::Ip6(v6) => {
            hasher.update(&v6.src.bytes());
            hasher.update(&v6.dst.bytes());
            hasher.update(&[u8::from(v6.proto)]);
        }
    }

    let (sport, dport) = match ulp {
        UlpMeta::Tcp(tcp) => (tcp.src, tcp.dst),
        UlpMeta::Udp(udp) => (udp.src, udp.dst),
        //TODO use icmp id
        UlpMeta::Icmpv4(_) | UlpMeta::Icmpv6(_) => (0, 0),
    };
    hasher.update(&sport.to_be_bytes());
    hasher.update(&dport.to_be_bytes());

    Some(hasher.finalize())
}
}

/// A network packet.
Expand Down
2 changes: 1 addition & 1 deletion lib/oxide-vpc/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ pub struct SetVirt2PhysReq {
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct SetVirt2BoundaryReq {
pub vip: IpCidr,
pub tep: TunnelEndpoint,
pub tep: Vec<TunnelEndpoint>,
}

/// Clear a mapping from VPC IP to a boundary tunnel endpoint destination.
Expand Down
64 changes: 40 additions & 24 deletions lib/oxide-vpc/src/engine/overlay.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ impl StaticAction for EncapAction {
// The encap action is only used for outgoing.
_dir: Direction,
flow_id: &InnerFlowId,
_pkt_meta: &PacketMeta,
pkt_meta: &PacketMeta,
action_meta: &mut ActionMeta,
) -> GenHtResult {
// The router layer determines a RouterTarget and stores it in
Expand Down Expand Up @@ -236,11 +236,27 @@ impl StaticAction for EncapAction {
let phys_target = match target {
RouterTargetInternal::InternetGateway => {
match self.v2b.get(&flow_id.dst_ip) {
Some(phys) => PhysNet {
ether: MacAddr::from(INTERNET_GATEWAY_MAC),
ip: phys.ip,
vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(),
},
Some(phys) => {
    // Nothing on this path guarantees that the endpoint list for a
    // prefix is non-empty (the v2b table stores an arbitrary `Vec`).
    // An empty list would make the `%` below a remainder-by-zero
    // panic, so treat it the same as having no mapping at all.
    if phys.is_empty() {
        return Ok(AllowOrDeny::Deny);
    }

    // Hash the packet onto a route target. This is a very
    // rudimentary mechanism. Should level-up to an ECMP
    // algorithm with well-known statistical properties.
    let hash = match pkt_meta.l4_hash() {
        Some(h) => h,
        None => {
            return Err(GenHtError::Unexpected {
                msg: "could not compute l4 hash for packet"
                    .to_string(),
            });
        }
    };
    let hash = hash as usize;
    // `phys` is non-empty here, so the index is always in bounds.
    let ip = phys[hash % phys.len()].ip;
    PhysNet {
        ether: MacAddr::from(INTERNET_GATEWAY_MAC),
        ip,
        vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(),
    }
}
None => return Ok(AllowOrDeny::Deny),
}
}
Expand Down Expand Up @@ -519,8 +535,8 @@ pub struct Virt2Phys {
pub struct Virt2Boundary {
// The BTreeMap-based representation of the v2b table is a representation
// that is easily updated.
ip4: KMutex<BTreeMap<Ipv4Cidr, TunnelEndpoint>>,
ip6: KMutex<BTreeMap<Ipv6Cidr, TunnelEndpoint>>,
ip4: KMutex<BTreeMap<Ipv4Cidr, Vec<TunnelEndpoint>>>,
ip6: KMutex<BTreeMap<Ipv6Cidr, Vec<TunnelEndpoint>>>,

// The Poptrie-based representation of the v2b table is a data structure
// optimized for fast query times. It's not easily updated in-place. It's
Expand All @@ -533,26 +549,26 @@ pub struct Virt2Boundary {
// The poptrie is under a read-write lock to allow multiple concurrent
// readers. When we update we hold the lock just long enough to do a swap
// with a poptrie that was pre-built out of band.
pt4: KRwLock<Poptrie<TunnelEndpoint>>,
pt6: KRwLock<Poptrie<TunnelEndpoint>>,
pt4: KRwLock<Poptrie<Vec<TunnelEndpoint>>>,
pt6: KRwLock<Poptrie<Vec<TunnelEndpoint>>>,
}

pub const BOUNDARY_SERVICES_VNI: u32 = 99u32;
pub const INTERNET_GATEWAY_MAC: [u8; 6] = [0xA8, 0x40, 0x25, 0x77, 0x77, 0x77];

impl Virt2Boundary {
pub fn dump_ip4(&self) -> Vec<(Ipv4Cidr, TunnelEndpoint)> {
pub fn dump_ip4(&self) -> Vec<(Ipv4Cidr, Vec<TunnelEndpoint>)> {
let mut ip4 = Vec::new();
for (vip, baddr) in self.ip4.lock().iter() {
ip4.push((*vip, *baddr));
for (vip, baddrs) in self.ip4.lock().iter() {
ip4.push((*vip, baddrs.clone()));
}
ip4
}

pub fn dump_ip6(&self) -> Vec<(Ipv6Cidr, TunnelEndpoint)> {
pub fn dump_ip6(&self) -> Vec<(Ipv6Cidr, Vec<TunnelEndpoint>)> {
let mut ip6 = Vec::new();
for (vip, baddr) in self.ip6.lock().iter() {
ip6.push((*vip, *baddr));
for (vip, baddrs) in self.ip6.lock().iter() {
ip6.push((*vip, baddrs.clone()));
}
ip6
}
Expand Down Expand Up @@ -592,14 +608,14 @@ impl ResourceEntry for PhysNet {}
// are IPs. The mapping resource trait requires that the keys and query
// arguments be of the same type.
impl Virt2Boundary {
pub fn get(&self, vip: &IpAddr) -> Option<TunnelEndpoint> {
pub fn get(&self, vip: &IpAddr) -> Option<Vec<TunnelEndpoint>> {
match vip {
IpAddr::Ip4(ip4) => self.pt4.read().match_v4(u32::from(*ip4)),
IpAddr::Ip6(ip6) => self.pt6.read().match_v6(u128::from(*ip6)),
}
}

pub fn remove(&self, vip: &IpCidr) -> Option<TunnelEndpoint> {
pub fn remove(&self, vip: &IpCidr) -> Option<Vec<TunnelEndpoint>> {
match vip {
IpCidr::Ip4(ip4) => {
let e = self.ip4.lock().remove(ip4);
Expand All @@ -617,8 +633,8 @@ impl Virt2Boundary {
pub fn set(
&self,
vip: IpCidr,
tep: TunnelEndpoint,
) -> Option<TunnelEndpoint> {
tep: Vec<TunnelEndpoint>,
) -> Option<Vec<TunnelEndpoint>> {
match vip {
IpCidr::Ip4(ip4) => {
let e = self.ip4.lock().insert(ip4, tep);
Expand All @@ -638,7 +654,7 @@ impl Virt2Boundary {
self.ip4
.lock()
.iter()
.map(|(k, v)| ((u32::from(k.ip()), k.prefix_len()), *v))
.map(|(k, v)| ((u32::from(k.ip()), k.prefix_len()), v.clone()))
.collect(),
);
*self.pt4.write() = poptrie::Poptrie::from(table);
Expand All @@ -649,7 +665,7 @@ impl Virt2Boundary {
self.ip6
.lock()
.iter()
.map(|(k, v)| ((u128::from(k.ip()), k.prefix_len()), *v))
.map(|(k, v)| ((u128::from(k.ip()), k.prefix_len()), v.clone()))
.collect(),
);
*self.pt6.write() = poptrie::Poptrie::from(table);
Expand Down Expand Up @@ -746,8 +762,8 @@ impl CmdOk for DumpVirt2PhysResp {}

#[derive(Debug, Deserialize, Serialize)]
pub struct V2bMapResp {
pub ip4: Vec<(Ipv4Cidr, TunnelEndpoint)>,
pub ip6: Vec<(Ipv6Cidr, TunnelEndpoint)>,
pub ip4: Vec<(Ipv4Cidr, Vec<TunnelEndpoint>)>,
pub ip6: Vec<(Ipv6Cidr, Vec<TunnelEndpoint>)>,
}

#[derive(Debug, Deserialize, Serialize)]
Expand Down
8 changes: 6 additions & 2 deletions lib/oxide-vpc/src/engine/print.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,18 @@ pub fn print_v2b(resp: &DumpVirt2BoundaryResp) {
print_hr();
print_v2b_header();
for x in &resp.mappings.ip4 {
print_v2b_entry(x.0.into(), x.1.ip, x.1.vni);
for tep in &x.1 {
print_v2b_entry(x.0.into(), tep.ip, tep.vni);
}
}
println!();
println!("IPv6 mappings");
print_hr();
print_v2b_header();
for x in &resp.mappings.ip6 {
print_v2b_entry(x.0.into(), x.1.ip, x.1.vni);
for tep in &x.1 {
print_v2b_entry(x.0.into(), tep.ip, tep.vni);
}
}
println!();
}
Expand Down
8 changes: 4 additions & 4 deletions lib/oxide-vpc/tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,17 +307,17 @@ pub fn oxide_net_setup2(
let v2b = Arc::new(Virt2Boundary::new());
v2b.set(
"0.0.0.0/0".parse().unwrap(),
TunnelEndpoint {
vec![TunnelEndpoint {
ip: "fd00:9900::1".parse().unwrap(),
vni: Vni::new(99u32).unwrap(),
},
}],
);
v2b.set(
"::/0".parse().unwrap(),
TunnelEndpoint {
vec![TunnelEndpoint {
ip: "fd00:9900::1".parse().unwrap(),
vni: Vni::new(99u32).unwrap(),
},
}],
);

let port =
Expand Down
50 changes: 2 additions & 48 deletions xde/src/xde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ use core::ffi::CStr;
use core::num::NonZeroU32;
use core::ptr;
use core::time::Duration;
use crc32fast::Hasher;
use illumos_sys_hdrs::*;
use opte::api::CmdOk;
use opte::api::Direction;
Expand All @@ -58,8 +57,6 @@ use opte::engine::ether::EtherAddr;
use opte::engine::geneve::Vni;
use opte::engine::headers::EncapMeta;
use opte::engine::headers::IpAddr;
use opte::engine::headers::IpMeta;
use opte::engine::headers::UlpMeta;
use opte::engine::ioctl::{self as api};
use opte::engine::ip6::Ipv6Addr;
use opte::engine::packet::Initialized;
Expand Down Expand Up @@ -107,11 +104,6 @@ const XDE_STR: *const c_char = b"xde\0".as_ptr() as *const c_char;
/// Name of the control device.
const XDE_CTL_STR: *const c_char = b"ctl\0".as_ptr() as *const c_char;

//TODO make configurable
/// The boundary services prefix fd00:99::
const BOUNDARY_SERVICES_PREFIX: u128 =
0xfd00_0099_0000_0000_0000_0000_0000_0000u128;

/// Minor number for the control device.
// Set once in `xde_attach`.
static mut XDE_CTL_MINOR: minor_t = 0;
Expand Down Expand Up @@ -288,32 +280,6 @@ struct XdeDev {
u2: Arc<xde_underlay_port>,
}

/// Expands to an expression that yields a CRC32 hash of a packet's inner
/// flow.
///
/// `$pkt_meta` supplies the upper-layer (ULP) metadata via `.inner.ulp`;
/// `$ip_meta` supplies the inner IP `src`, `dst` and `proto` fields.
///
/// If the packet has no ULP metadata, the expansion logs a debug message
/// and executes `return ptr::null_mut()` in the *calling* function, so the
/// macro can only be used inside a function that returns a raw pointer.
macro_rules! l4_hash {
($pkt_meta:expr, $ip_meta:expr) => {
match $pkt_meta.inner.ulp {
Some(ulp) => {
// Pick the port pair for TCP/UDP; ICMP contributes zeros for now.
let (src, dst) = match ulp {
UlpMeta::Tcp(t) => (t.src, t.dst),
UlpMeta::Udp(u) => (u.src, u.dst),
UlpMeta::Icmpv4(_) => (0, 0), //TODO use icmp id
UlpMeta::Icmpv6(_) => (0, 0), //TODO use icmp id
};
// Feed the hasher: IP src, IP dst, protocol byte, then the two
// ports in big-endian byte order.
let mut h = Hasher::new();
h.update(&$ip_meta.src.bytes());
h.update(&$ip_meta.dst.bytes());
h.update(&[u8::from($ip_meta.proto)]);
h.update(&src.to_be_bytes());
h.update(&dst.to_be_bytes());
h.finalize()
}
None => {
// No ULP metadata: log and return a null pointer from the
// function this macro was expanded in.
opte::engine::dbg("packet with no ulp, dropping");
return ptr::null_mut();
}
}
};
}

#[cfg(not(test))]
#[no_mangle]
unsafe extern "C" fn _init() -> c_int {
Expand Down Expand Up @@ -1545,26 +1511,14 @@ unsafe extern "C" fn xde_mc_tx(
return guest_loopback(src_dev, pkt, vni);
}

let hash = if ip6.dst.has_prefix(BOUNDARY_SERVICES_PREFIX, 32) {
match meta.inner.ip {
Some(IpMeta::Ip4(m)) => Some(l4_hash!(meta, m)),
Some(IpMeta::Ip6(m)) => Some(l4_hash!(meta, m)),
None => {
opte::engine::dbg("packet with no inner ip, dropping");
return ptr::null_mut();
}
}
} else {
None
};

// Currently the overlay layer leaves the outer frame
// destination and source zero'd. Ask IRE for the route
// associated with the underlay destination. Then ask NCE
// for the mac associated with the IRE nexthop to fill in
// the outer frame of the packet. Also return the underlay
// device associated with the nexthop
let (src, dst, underlay_dev) = next_hop(&ip6.dst, src_dev, hash);
let (src, dst, underlay_dev) =
next_hop(&ip6.dst, src_dev, meta.l4_hash());

// Get a pointer to the beginning of the outer frame and
// fill in the dst/src addresses before sending out the
Expand Down

0 comments on commit 8c6326c

Please sign in to comment.