diff --git a/Cargo.lock b/Cargo.lock index 2ae05340..b4d422a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -979,7 +979,7 @@ checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" [[package]] name = "poptrie" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/poptrie#11c9ce88bfb950bdac1b5016e37c6aedc5ffd05b" +source = "git+https://github.com/oxidecomputer/poptrie?branch=multipath#ca52bef3f87ff1a67d81b3c6e601dcb5fdbcc165" [[package]] name = "postcard" diff --git a/Cargo.toml b/Cargo.toml index 173ea703..14b2818a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,7 +60,7 @@ version_check = "0.9" zerocopy = { version = "0.7", features = ["derive"] } zone = { git = "https://github.com/oxidecomputer/zone" } ztest = { git = "https://github.com/oxidecomputer/falcon", branch = "main" } -poptrie = { git = "https://github.com/oxidecomputer/poptrie" } +poptrie = { git = "https://github.com/oxidecomputer/poptrie", branch = "multipath" } [profile.release] debug = 2 diff --git a/bin/opteadm/src/bin/opteadm.rs b/bin/opteadm/src/bin/opteadm.rs index 1e279d83..75fc9258 100644 --- a/bin/opteadm/src/bin/opteadm.rs +++ b/bin/opteadm/src/bin/opteadm.rs @@ -198,7 +198,7 @@ enum Command { SetV2P { vpc_ip: IpAddr, vpc_mac: MacAddr, underlay_ip: Ipv6Addr, vni: Vni }, /// Set a virtual-to-boundary mapping - SetV2B { prefix: IpCidr, tunnel_endpoint: Ipv6Addr }, + SetV2B { prefix: IpCidr, tunnel_endpoint: Vec }, /// Clear a virtual-to-boundary mapping ClearV2B { vpc_ip: IpCidr }, @@ -623,10 +623,10 @@ fn main() -> anyhow::Result<()> { Command::SetV2B { prefix, tunnel_endpoint } => { let hdl = opteadm::OpteAdm::open(OpteAdm::XDE_CTL)?; - let tep = TunnelEndpoint { - ip: tunnel_endpoint, - vni: Vni::new(99u32).unwrap(), - }; + let tep = tunnel_endpoint + .into_iter() + .map(|ip| TunnelEndpoint { ip, vni: Vni::new(99u32).unwrap() }) + .collect(); let req = SetVirt2BoundaryReq { vip: prefix, tep }; hdl.set_v2b(&req)?; } diff --git a/lib/opte/src/engine/packet.rs b/lib/opte/src/engine/packet.rs index 748ffb17..646942a8 100644 --- a/lib/opte/src/engine/packet.rs +++ b/lib/opte/src/engine/packet.rs @@ -49,6 +49,7 @@ use core::fmt::Display; use core::ptr; use core::result; use core::slice; +use crc32fast::Hasher; use dyn_clone::DynClone; use serde::Deserialize; use serde::Serialize; @@ -325,6 +326,36 @@ impl PacketMeta { _ => None, } } + + pub fn l4_hash(&self) -> Option { + let ulp = match self.inner.ulp { + Some(ulp) => ulp, + None => return None, + }; + let mut h = Hasher::new(); + match &self.inner.ip { + Some(IpMeta::Ip4(m)) => { + h.update(&m.src.bytes()); + h.update(&m.dst.bytes()); + h.update(&[u8::from(m.proto)]); + } + Some(IpMeta::Ip6(m)) => { + h.update(&m.src.bytes()); + h.update(&m.dst.bytes()); + h.update(&[u8::from(m.proto)]); + } + None => return None, + }; + let (src, dst) = match ulp { + UlpMeta::Tcp(t) => (t.src, t.dst), + UlpMeta::Udp(u) => (u.src, u.dst), + UlpMeta::Icmpv4(_) => (0, 0), //TODO use icmp id + UlpMeta::Icmpv6(_) => (0, 0), //TODO use icmp id + }; + h.update(&src.to_be_bytes()); + h.update(&dst.to_be_bytes()); + Some(h.finalize()) + } } /// A network packet. diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index 8b31fb64..727f734a 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -464,7 +464,7 @@ pub struct SetVirt2PhysReq { #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetVirt2BoundaryReq { pub vip: IpCidr, - pub tep: TunnelEndpoint, + pub tep: Vec, } /// Clear a mapping from VPC IP to a boundary tunnel endpoint destination. diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 34ec5a96..1d724639 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -202,7 +202,7 @@ impl StaticAction for EncapAction { // The encap action is only used for outgoing. _dir: Direction, flow_id: &InnerFlowId, - _pkt_meta: &PacketMeta, + pkt_meta: &PacketMeta, action_meta: &mut ActionMeta, ) -> GenHtResult { // The router layer determines a RouterTarget and stores it in @@ -236,11 +236,27 @@ impl StaticAction for EncapAction { let phys_target = match target { RouterTargetInternal::InternetGateway => { match self.v2b.get(&flow_id.dst_ip) { - Some(phys) => PhysNet { - ether: MacAddr::from(INTERNET_GATEWAY_MAC), - ip: phys.ip, - vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), - }, + Some(phys) => { + // Hash the packet onto a route target. This is a very + // rudimentary mechanism. Should level-up to an ECMP + // algorithm with well known statistical properties. + let hash = match pkt_meta.l4_hash() { + Some(h) => h, + None => { + return Err(GenHtError::Unexpected { + msg: "could not compute l4 hash for packet" + .to_string(), + }); + } + }; + let hash = hash as usize; + let ip = phys[hash % phys.len()].ip; + PhysNet { + ether: MacAddr::from(INTERNET_GATEWAY_MAC), + ip, + vni: Vni::new(BOUNDARY_SERVICES_VNI).unwrap(), + } + } None => return Ok(AllowOrDeny::Deny), } } @@ -519,8 +535,8 @@ pub struct Virt2Phys { pub struct Virt2Boundary { // The BTreeMap-based representation of the v2b table is a representation // that is easily updated. - ip4: KMutex>, - ip6: KMutex>, + ip4: KMutex>>, + ip6: KMutex>>, // The Poptrie-based representation of the v2b table is a data structure // optimized for fast query times. It's not easily updated in-place. It's @@ -533,26 +549,26 @@ pub struct Virt2Boundary { // The poptrie is under an read-write lock to allow multiple concurrent // readers. When we update we hold the lock just long enough to do a swap // with a poptrie that was pre-built out of band. - pt4: KRwLock>, - pt6: KRwLock>, + pt4: KRwLock>>, + pt6: KRwLock>>, } pub const BOUNDARY_SERVICES_VNI: u32 = 99u32; pub const INTERNET_GATEWAY_MAC: [u8; 6] = [0xA8, 0x40, 0x25, 0x77, 0x77, 0x77]; impl Virt2Boundary { - pub fn dump_ip4(&self) -> Vec<(Ipv4Cidr, TunnelEndpoint)> { + pub fn dump_ip4(&self) -> Vec<(Ipv4Cidr, Vec)> { let mut ip4 = Vec::new(); - for (vip, baddr) in self.ip4.lock().iter() { - ip4.push((*vip, *baddr)); + for (vip, baddrs) in self.ip4.lock().iter() { + ip4.push((*vip, baddrs.clone())); } ip4 } - pub fn dump_ip6(&self) -> Vec<(Ipv6Cidr, TunnelEndpoint)> { + pub fn dump_ip6(&self) -> Vec<(Ipv6Cidr, Vec)> { let mut ip6 = Vec::new(); - for (vip, baddr) in self.ip6.lock().iter() { - ip6.push((*vip, *baddr)); + for (vip, baddrs) in self.ip6.lock().iter() { + ip6.push((*vip, baddrs.clone())); } ip6 } @@ -592,14 +608,14 @@ impl ResourceEntry for PhysNet {} // are IPs. The mapping resource trait requires that the keys and query // arguments be of the same type. impl Virt2Boundary { - pub fn get(&self, vip: &IpAddr) -> Option { + pub fn get(&self, vip: &IpAddr) -> Option> { match vip { IpAddr::Ip4(ip4) => self.pt4.read().match_v4(u32::from(*ip4)), IpAddr::Ip6(ip6) => self.pt6.read().match_v6(u128::from(*ip6)), } } - pub fn remove(&self, vip: &IpCidr) -> Option { + pub fn remove(&self, vip: &IpCidr) -> Option> { match vip { IpCidr::Ip4(ip4) => { let e = self.ip4.lock().remove(ip4); @@ -617,8 +633,8 @@ impl Virt2Boundary { pub fn set( &self, vip: IpCidr, - tep: TunnelEndpoint, - ) -> Option { + tep: Vec, + ) -> Option> { match vip { IpCidr::Ip4(ip4) => { let e = self.ip4.lock().insert(ip4, tep); @@ -638,7 +654,7 @@ impl Virt2Boundary { self.ip4 .lock() .iter() - .map(|(k, v)| ((u32::from(k.ip()), k.prefix_len()), *v)) + .map(|(k, v)| ((u32::from(k.ip()), k.prefix_len()), v.clone())) .collect(), ); *self.pt4.write() = poptrie::Poptrie::from(table); @@ -649,7 +665,7 @@ impl Virt2Boundary { self.ip6 .lock() .iter() - .map(|(k, v)| ((u128::from(k.ip()), k.prefix_len()), *v)) + .map(|(k, v)| ((u128::from(k.ip()), k.prefix_len()), v.clone())) .collect(), ); *self.pt6.write() = poptrie::Poptrie::from(table); @@ -746,8 +762,8 @@ impl CmdOk for DumpVirt2PhysResp {} #[derive(Debug, Deserialize, Serialize)] pub struct V2bMapResp { - pub ip4: Vec<(Ipv4Cidr, TunnelEndpoint)>, - pub ip6: Vec<(Ipv6Cidr, TunnelEndpoint)>, + pub ip4: Vec<(Ipv4Cidr, Vec)>, + pub ip6: Vec<(Ipv6Cidr, Vec)>, } #[derive(Debug, Deserialize, Serialize)] diff --git a/lib/oxide-vpc/src/engine/print.rs b/lib/oxide-vpc/src/engine/print.rs index 7762caaf..d14d17e3 100644 --- a/lib/oxide-vpc/src/engine/print.rs +++ b/lib/oxide-vpc/src/engine/print.rs @@ -75,14 +75,18 @@ pub fn print_v2b(resp: &DumpVirt2BoundaryResp) { print_hr(); print_v2b_header(); for x in &resp.mappings.ip4 { - print_v2b_entry(x.0.into(), x.1.ip, x.1.vni); + for tep in &x.1 { + print_v2b_entry(x.0.into(), tep.ip, tep.vni); + } } println!(); println!("IPv6 mappings"); print_hr(); print_v2b_header(); for x in &resp.mappings.ip6 { - print_v2b_entry(x.0.into(), x.1.ip, x.1.vni); + for tep in &x.1 { + print_v2b_entry(x.0.into(), tep.ip, tep.vni); + } } println!(); } diff --git a/lib/oxide-vpc/tests/common/mod.rs b/lib/oxide-vpc/tests/common/mod.rs index dd07b1dd..c97d2d53 100644 --- a/lib/oxide-vpc/tests/common/mod.rs +++ b/lib/oxide-vpc/tests/common/mod.rs @@ -307,17 +307,17 @@ pub fn oxide_net_setup2( let v2b = Arc::new(Virt2Boundary::new()); v2b.set( "0.0.0.0/0".parse().unwrap(), - TunnelEndpoint { + vec![TunnelEndpoint { ip: "fd00:9900::1".parse().unwrap(), vni: Vni::new(99u32).unwrap(), - }, + }], ); v2b.set( "::/0".parse().unwrap(), - TunnelEndpoint { + vec![TunnelEndpoint { ip: "fd00:9900::1".parse().unwrap(), vni: Vni::new(99u32).unwrap(), - }, + }], ); let port = diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 110dfe04..5ef91582 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -38,7 +38,6 @@ use core::ffi::CStr; use core::num::NonZeroU32; use core::ptr; use core::time::Duration; -use crc32fast::Hasher; use illumos_sys_hdrs::*; use opte::api::CmdOk; use opte::api::Direction; @@ -58,8 +57,6 @@ use opte::engine::ether::EtherAddr; use opte::engine::geneve::Vni; use opte::engine::headers::EncapMeta; use opte::engine::headers::IpAddr; -use opte::engine::headers::IpMeta; -use opte::engine::headers::UlpMeta; use opte::engine::ioctl::{self as api}; use opte::engine::ip6::Ipv6Addr; use opte::engine::packet::Initialized; @@ -107,11 +104,6 @@ const XDE_STR: *const c_char = b"xde\0".as_ptr() as *const c_char; /// Name of the control device. const XDE_CTL_STR: *const c_char = b"ctl\0".as_ptr() as *const c_char; -//TODO make configurable -/// The boundary services prefix fd00:99:: -const BOUNDARY_SERVICES_PREFIX: u128 = - 0xfd00_0099_0000_0000_0000_0000_0000_0000u128; - /// Minor number for the control device. // Set once in `xde_attach`. static mut XDE_CTL_MINOR: minor_t = 0; @@ -288,32 +280,6 @@ struct XdeDev { u2: Arc, } -macro_rules! l4_hash { - ($pkt_meta:expr, $ip_meta:expr) => { - match $pkt_meta.inner.ulp { - Some(ulp) => { - let (src, dst) = match ulp { - UlpMeta::Tcp(t) => (t.src, t.dst), - UlpMeta::Udp(u) => (u.src, u.dst), - UlpMeta::Icmpv4(_) => (0, 0), //TODO use icmp id - UlpMeta::Icmpv6(_) => (0, 0), //TODO use icmp id - }; - let mut h = Hasher::new(); - h.update(&$ip_meta.src.bytes()); - h.update(&$ip_meta.dst.bytes()); - h.update(&[u8::from($ip_meta.proto)]); - h.update(&src.to_be_bytes()); - h.update(&dst.to_be_bytes()); - h.finalize() - } - None => { - opte::engine::dbg("packet with no ulp, dropping"); - return ptr::null_mut(); - } - } - }; -} - #[cfg(not(test))] #[no_mangle] unsafe extern "C" fn _init() -> c_int { @@ -1545,26 +1511,14 @@ unsafe extern "C" fn xde_mc_tx( return guest_loopback(src_dev, pkt, vni); } - let hash = if ip6.dst.has_prefix(BOUNDARY_SERVICES_PREFIX, 32) { - match meta.inner.ip { - Some(IpMeta::Ip4(m)) => Some(l4_hash!(meta, m)), - Some(IpMeta::Ip6(m)) => Some(l4_hash!(meta, m)), - None => { - opte::engine::dbg("packet with no inner ip, dropping"); - return ptr::null_mut(); - } - } - } else { - None - }; - // Currently the overlay layer leaves the outer frame // destination and source zero'd. Ask IRE for the route // associated with the underlay destination. Then ask NCE // for the mac associated with the IRE nexthop to fill in // the outer frame of the packet. Also return the underlay // device associated with the nexthop - let (src, dst, underlay_dev) = next_hop(&ip6.dst, src_dev, hash); + let (src, dst, underlay_dev) = + next_hop(&ip6.dst, src_dev, meta.l4_hash()); // Get a pointer to the beginning of the outer frame and // fill in the dst/src addresses before sending out the