
fix(join): joining on different types #3716

Merged 9 commits on Jan 29, 2025
Changes from 6 commits
12 changes: 8 additions & 4 deletions daft/daft/__init__.pyi
@@ -1455,8 +1455,8 @@ class PyMicroPartition:
right: PyMicroPartition,
left_on: list[PyExpr],
right_on: list[PyExpr],
null_equals_nulls: list[bool] | None,
how: JoinType,
null_equals_nulls: list[bool] | None = None,
Comment on lines -1458 to +1459
@kevinzwang (Member, Author) commented on Jan 22, 2025:

small drive-by fix in typing to match rust definition

) -> PyMicroPartition: ...
def pivot(
self,
@@ -1584,6 +1584,11 @@ class AdaptivePhysicalPlanScheduler:
num_rows: int,
) -> None: ...

class JoinColumnRenamingParams:
def __new__(
cls, prefix: str | None, suffix: str | None, merge_matching_join_keys: bool
) -> JoinColumnRenamingParams: ...

class LogicalPlanBuilder:
"""A logical plan builder, which simplifies constructing logical plans via a fluent interface.

@@ -1642,9 +1647,8 @@ class LogicalPlanBuilder:
left_on: list[PyExpr],
right_on: list[PyExpr],
join_type: JoinType,
strategy: JoinStrategy | None = None,
join_prefix: str | None = None,
join_suffix: str | None = None,
join_strategy: JoinStrategy | None = None,
column_renaming_params: JoinColumnRenamingParams | None = None,
Contributor commented:

i'd prefer deprecating join_prefix and join_suffix instead of just flat out removing them.

Contributor commented:

i'm kind of also on the fence of leaving these as is for the dataframe api. IMO, it's a bit cleaner to do

df.join(df2, prefix="df2.", suffix="_joined")

instead of

from daft.daft import JoinColumnRenamingParams

df.join(df2, column_renaming_params=JoinColumnRenamingParams(prefix="df2.", suffix="_joined"))

Member Author replied:

The API for the actual dataframe operation has not changed, this is just for the builder. Do you think we should be concerned about breaking the builder API?

) -> LogicalPlanBuilder: ...
def concat(self, other: LogicalPlanBuilder) -> LogicalPlanBuilder: ...
def intersect(self, other: LogicalPlanBuilder, is_all: bool) -> LogicalPlanBuilder: ...
5 changes: 2 additions & 3 deletions daft/dataframe/dataframe.py
@@ -32,7 +32,7 @@
from daft.api_annotations import DataframePublicAPI
from daft.context import get_context
from daft.convert import InputListType
from daft.daft import FileFormat, IOConfig, JoinStrategy, JoinType, check_column_name_validity
from daft.daft import FileFormat, IOConfig, JoinColumnRenamingParams, JoinStrategy, JoinType, check_column_name_validity
from daft.dataframe.preview import DataFramePreview
from daft.datatype import DataType
from daft.errors import ExpressionTypeError
@@ -1897,8 +1897,7 @@ def join(
right_on=right_exprs,
how=join_type,
strategy=join_strategy,
join_prefix=prefix,
join_suffix=suffix,
column_renaming_params=JoinColumnRenamingParams(prefix, suffix, True),
)
return DataFrame(builder)

7 changes: 3 additions & 4 deletions daft/logical/builder.py
@@ -8,6 +8,7 @@
CountMode,
FileFormat,
IOConfig,
JoinColumnRenamingParams,
JoinStrategy,
JoinType,
PyDaftExecutionConfig,
@@ -257,17 +258,15 @@ def join( # type: ignore[override]
right_on: list[Expression],
how: JoinType = JoinType.Inner,
strategy: JoinStrategy | None = None,
join_suffix: str | None = None,
join_prefix: str | None = None,
column_renaming_params: JoinColumnRenamingParams | None = None,
) -> LogicalPlanBuilder:
builder = self._builder.join(
right._builder,
[expr._expr for expr in left_on],
[expr._expr for expr in right_on],
how,
strategy,
join_suffix,
join_prefix,
column_renaming_params,
)
return LogicalPlanBuilder(builder)

42 changes: 32 additions & 10 deletions src/arrow2/src/array/dyn_ord.rs
@@ -1,19 +1,16 @@
use std::cmp::Ordering;

use num_traits::Float;
use ord::total_cmp;

use std::cmp::Ordering;

use crate::datatypes::*;
use crate::error::Error;
use crate::offset::Offset;
use crate::{array::*, types::NativeType};
use crate::{array::*, datatypes::*, error::Error, offset::Offset, types::NativeType};

/// Compare the values at two arbitrary indices in two arbitrary arrays.
pub type DynArrayComparator =
Box<dyn Fn(&dyn Array, &dyn Array, usize, usize) -> Ordering + Send + Sync>;

#[inline]
unsafe fn is_valid<A: Array>(arr: &A, i: usize) -> bool {
unsafe fn is_valid(arr: &dyn Array, i: usize) -> bool {
// avoid dyn function hop by using generic
arr.validity()
.as_ref()
@@ -22,9 +19,9 @@ unsafe fn is_valid<A: Array>(arr: &A, i: usize) -> bool {
}

#[inline]
fn compare_with_nulls<A: Array, F: FnOnce() -> Ordering>(
left: &A,
right: &A,
fn compare_with_nulls<F: FnOnce() -> Ordering>(
left: &dyn Array,
right: &dyn Array,
i: usize,
j: usize,
nulls_equal: bool,
@@ -122,6 +119,30 @@ fn compare_dyn_boolean(nulls_equal: bool) -> DynArrayComparator {
})
}

fn compare_dyn_null(nulls_equal: bool) -> DynArrayComparator {
Box::new(move |left, right, i, j| {
assert!(i < left.len());
assert!(j < right.len());
// need the extra datatype check in match because the validity of a null array
// is quizzically always true and not false
match (
unsafe { is_valid(left, i) } && *left.data_type() != DataType::Null,
unsafe { is_valid(right, j) } && *right.data_type() != DataType::Null,
) {
(true, true) => unreachable!(),
(false, true) => Ordering::Greater,
(true, false) => Ordering::Less,
(false, false) => {
if nulls_equal {
Ordering::Equal
} else {
Ordering::Less
}
}
}
})
}
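The null-handling branch in `compare_dyn_null` above can be illustrated with a small Python sketch (the function name is hypothetical; the real implementation is the Rust closure above). A slot counts as valid only when its validity bit is set and the array's dtype is not Null, which is why the `(true, true)` arm is unreachable:

```python
def compare_null_slots(left_valid: bool, right_valid: bool, nulls_equal: bool) -> int:
    """Mirror of the (validity, validity) match in compare_dyn_null."""
    if left_valid and right_valid:
        # This comparator is only built when at least one side is Null-typed.
        raise AssertionError("unreachable: one side must be null")
    if not left_valid and right_valid:
        return 1   # Ordering::Greater: null sorts after a value
    if left_valid and not right_valid:
        return -1  # Ordering::Less
    # Both null: equal only when null_equals_nulls is requested.
    return 0 if nulls_equal else -1
```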

pub fn build_dyn_array_compare(
left: &DataType,
right: &DataType,
@@ -187,6 +208,7 @@ pub fn build_dyn_array_compare(
// }
// }
// }
(Null, _) | (_, Null) => compare_dyn_null(nulls_equal),
(lhs, _) => {
return Err(Error::InvalidArgumentError(format!(
"The data type type {lhs:?} has no natural order"
14 changes: 10 additions & 4 deletions src/arrow2/src/array/ord.rs
@@ -2,10 +2,7 @@

use std::cmp::Ordering;

use crate::datatypes::*;
use crate::error::Error;
use crate::offset::Offset;
use crate::{array::*, types::NativeType};
use crate::{array::*, datatypes::*, error::Error, offset::Offset, types::NativeType};

/// Compare the values at two arbitrary indices in two arrays.
pub type DynComparator = Box<dyn Fn(usize, usize) -> Ordering + Send + Sync>;
@@ -157,6 +154,14 @@ macro_rules! dyn_dict {
}};
}

fn compare_null(_left: &dyn Array, _right: &dyn Array) -> DynComparator {
Box::new(move |_i: usize, _j: usize| {
// nulls do not have a canonical ordering, but it is trivially implemented so that
// null arrays can be used in things that depend on `build_compare`
Ordering::Less
})
}
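As the comment in `compare_null` notes, nulls have no canonical ordering; the comparator just returns a constant so that null arrays can flow through code paths that depend on `build_compare`. A minimal Python equivalent of that behavior (illustrative only):

```python
def compare_null(_i: int, _j: int) -> int:
    # Nulls do not have a canonical ordering; returning a constant
    # "less" (-1) keeps null arrays usable wherever a comparator is required.
    return -1
```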

/// returns a comparison function that compares values at two different slots
/// between two [`Array`].
/// # Example
@@ -243,6 +248,7 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result<DynComparato
}
}
}
(Null, _) | (_, Null) => compare_null(left, right),
(lhs, _) => {
return Err(Error::InvalidArgumentError(format!(
"The data type type {lhs:?} has no natural order"
36 changes: 32 additions & 4 deletions src/daft-dsl/src/expr/mod.rs
@@ -3,6 +3,7 @@

use std::{
any::Any,
collections::HashSet,
hash::{DefaultHasher, Hash, Hasher},
io::{self, Write},
str::FromStr,
@@ -21,7 +22,6 @@
utils::supertype::try_get_supertype,
};
use derive_more::Display;
use itertools::Itertools;
use serde::{Deserialize, Serialize};

use super::functions::FunctionExpr;
@@ -1320,9 +1320,9 @@
// Check if one set of columns is a reordering of the other
pub fn is_partition_compatible(a: &[ExprRef], b: &[ExprRef]) -> bool {
// sort a and b by name
let a: Vec<&str> = a.iter().map(|a| a.name()).sorted().collect();
let b: Vec<&str> = b.iter().map(|a| a.name()).sorted().collect();
a == b
let a_set: HashSet<&ExprRef> = HashSet::from_iter(a);
let b_set: HashSet<&ExprRef> = HashSet::from_iter(b);
a_set == b_set
}
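The change to `is_partition_compatible` swaps sorted-name comparison for set equality over the full expressions, so two partitioning specs are compatible exactly when one is a reordering of the other. A Python sketch of the same idea (strings stand in for expressions):

```python
def is_partition_compatible(a: list, b: list) -> bool:
    # Compatible when one side is a reordering of the other; comparing
    # whole expressions (not just sorted names) avoids conflating
    # distinct expressions that happen to share a name.
    return set(a) == set(b)
```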

pub fn has_agg(expr: &ExprRef) -> bool {
@@ -1443,3 +1443,31 @@
.collect::<DaftResult<_>>()?;
Ok(Arc::new(Schema::new(fields)?))
}

/// Adds aliases as appropriate to ensure that all expressions have unique names.
pub fn deduplicate_expr_names(exprs: &[ExprRef]) -> Vec<ExprRef> {
let mut names_so_far = HashSet::new();

exprs
.iter()
.map(|e| {
let curr_name = e.name();

let mut i = 0;
let mut new_name = curr_name.to_string();

while names_so_far.contains(&new_name) {
i += 1;
new_name = format!("{}_{}", curr_name, i);
}

Codecov warning: added lines src/daft-dsl/src/expr/mod.rs#L1460-L1462 were not covered by tests.

names_so_far.insert(new_name.clone());

if i == 0 {
e.clone()
} else {
e.alias(new_name)

Codecov warning: added line src/daft-dsl/src/expr/mod.rs#L1469 was not covered by tests.
}
})
.collect()
}
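The aliasing scheme in `deduplicate_expr_names` can be sketched in plain Python: each duplicate name gets a `_1`, `_2`, ... suffix, counting up until the candidate has not been used yet (here over bare strings rather than expressions):

```python
def deduplicate_expr_names(names):
    # Mirror of the Rust deduplicate_expr_names: append "_{i}" to a
    # duplicate name, incrementing i until the candidate is unique.
    seen = set()
    out = []
    for name in names:
        i = 0
        new_name = name
        while new_name in seen:
            i += 1
            new_name = f"{name}_{i}"
        seen.add(new_name)
        out.append(new_name)
    return out
```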
99 changes: 99 additions & 0 deletions src/daft-dsl/src/join.rs
@@ -0,0 +1,99 @@
use common_error::DaftResult;
use daft_core::{prelude::*, utils::supertype::try_get_supertype};
use indexmap::IndexSet;

use crate::{deduplicate_expr_names, ExprRef};

pub fn get_common_join_cols<'a>(
@kevinzwang (Member, Author) commented on Jan 22, 2025:
Common columns now determined via schema instead of join keys so that join keys can be modified (ex: casting to supertype) without side effects. This does introduce a small API change in the order of the join schema: common columns are now sorted by left side schema instead of by join keys. I think this is fine but we could introduce a project under the left side to reorder if necessary.

left_schema: &'a SchemaRef,
right_schema: &'a SchemaRef,
) -> impl Iterator<Item = &'a String> {
left_schema
.fields
.keys()
.filter(|name| right_schema.has_field(name))
}
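Per the author's comment, common columns are now derived from the two schemas and ordered by the left side rather than by the join keys. An equivalent Python sketch over lists of field names:

```python
def get_common_join_cols(left_fields, right_fields):
    # Common columns come from the schemas themselves, ordered by the
    # left-side schema (not by the order of the join keys).
    right = set(right_fields)
    return [name for name in left_fields if name in right]
```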

/// Infer the schema of a join operation
pub fn infer_join_schema(
left_schema: &SchemaRef,
right_schema: &SchemaRef,
join_type: JoinType,
) -> DaftResult<SchemaRef> {
if matches!(join_type, JoinType::Anti | JoinType::Semi) {
Ok(left_schema.clone())
} else {
let common_cols = get_common_join_cols(left_schema, right_schema).collect::<IndexSet<_>>();

// common columns, then unique left fields, then unique right fields
let fields = common_cols
.iter()
.map(|name| {
let left_field = left_schema.get_field(name).unwrap();
let right_field = right_schema.get_field(name).unwrap();

Ok(match join_type {
JoinType::Inner => left_field.clone(),
JoinType::Left => left_field.clone(),
JoinType::Right => right_field.clone(),
JoinType::Outer => {
let supertype = try_get_supertype(&left_field.dtype, &right_field.dtype)?;

Field::new(*name, supertype)
}
JoinType::Anti | JoinType::Semi => unreachable!(),

Codecov warning: added line src/daft-dsl/src/join.rs#L44 was not covered by tests.
})
})
.chain(
left_schema
.fields
.iter()
.chain(right_schema.fields.iter())
.filter_map(|(name, field)| {
if common_cols.contains(name) {
None
} else {
Some(field.clone())
}
})
.map(Ok),
)
.collect::<DaftResult<_>>()?;

Ok(Schema::new(fields)?.into())
}
}
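The schema-inference rules above (common columns first, taking the left dtype for inner/left joins, the right dtype for right joins, and the supertype for outer joins; then unique left fields, then unique right fields) can be sketched in Python with ordered dicts mapping name to dtype. `supertype` here is a stand-in callable, not the real resolver:

```python
def infer_join_schema(left, right, join_type, supertype):
    # left/right: ordered dicts of column name -> dtype.
    if join_type in ("anti", "semi"):
        return dict(left)
    common = [n for n in left if n in right]
    fields = {}
    for n in common:
        if join_type in ("inner", "left"):
            fields[n] = left[n]
        elif join_type == "right":
            fields[n] = right[n]
        else:  # outer: resolve a common supertype
            fields[n] = supertype(left[n], right[n])
    # Unique left fields, then unique right fields, in schema order.
    for n, dt in list(left.items()) + list(right.items()):
        if n not in fields:
            fields[n] = dt
    return fields
```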

/// Casts join keys to the same types and make their names unique.
pub fn normalize_join_keys(
left_on: Vec<ExprRef>,
right_on: Vec<ExprRef>,
left_schema: SchemaRef,
right_schema: SchemaRef,
) -> DaftResult<(Vec<ExprRef>, Vec<ExprRef>)> {
let (left_on, right_on) = left_on
.into_iter()
.zip(right_on)
.map(|(mut l, mut r)| {
let l_dtype = l.to_field(&left_schema)?.dtype;
let r_dtype = r.to_field(&right_schema)?.dtype;

let supertype = try_get_supertype(&l_dtype, &r_dtype)?;

if l_dtype != supertype {
l = l.cast(&supertype);
}

if r_dtype != supertype {
r = r.cast(&supertype);
}

Ok((l, r))
})
.collect::<DaftResult<(Vec<_>, Vec<_>)>>()?;

let left_on = deduplicate_expr_names(&left_on);
let right_on = deduplicate_expr_names(&right_on);
Comment on lines +95 to +96
Contributor commented:

nit: can we make deduplicate_expr_names take in an iter so we don't have to materialize twice?

Member Author replied:

We need to materialize here anyway to get the DaftResult out of the iterator as well as split the iterator into two vecs.


Ok((left_on, right_on))
}
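The casting half of `normalize_join_keys` can be sketched as follows: each pair of join keys is cast to their common supertype, and a key already at the supertype is left uncast. Keys are modeled as `(name, dtype)` pairs, name deduplication is omitted, and `supertype` is a stand-in callable:

```python
def normalize_join_keys(left_on, right_on, supertype):
    # Cast each key pair to the pair's common supertype; a key is only
    # wrapped in a cast when its dtype differs from that supertype.
    def maybe_cast(name, dtype, st):
        return (f"cast({name} as {st})", st) if dtype != st else (name, dtype)

    new_left, new_right = [], []
    for (ln, ld), (rn, rd) in zip(left_on, right_on):
        st = supertype(ld, rd)
        new_left.append(maybe_cast(ln, ld, st))
        new_right.append(maybe_cast(rn, rd, st))
    return new_left, new_right
```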