Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Throw exception in grouping functions if frame contains missing values #405

Merged
merged 1 commit into from
Aug 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 43 additions & 26 deletions src/Deedle/Frame.fs
Original file line number Diff line number Diff line change
Expand Up @@ -1523,28 +1523,32 @@ and Frame<'TRowKey, 'TColumnKey when 'TRowKey : equality and 'TColumnKey : equal

/// Regroups the rows of the frame so that rows carrying the same label sit next to
/// each other, and returns a new frame keyed by `(label, original row key)` pairs.
///
/// `labels` supplies the group labels, matched positionally with `frame.RowKeys`;
/// `n` is the expected number of labels and the size passed to the relocation command.
/// The guarded variant (under `else` below) raises via `failwith` when `n` differs
/// from `Seq.length labels`.
member internal frame.GroupByLabels labels n =
// Row positions 0 .. n-1; zipped in twice below, first as source and then as destination offsets.
let offsets = [0 .. n-1]

// NOTE(review): this listing is a rendered diff without +/- markers. The `relocs` pipeline
// occurs twice; this first, unguarded copy looks like the pre-change body and the guarded
// copy after the length check looks like its replacement - confirm against the repository.
let relocs =
labels
|> Seq.zip offsets // seq of (srcloc, label)
|> Seq.zip frame.RowKeys // seq of (rowkey, (srcloc, label))
|> Seq.groupBy (fun (rk, (i, l)) -> l) // seq of (label, seq of (rowkey, (srcloc, label)))
|> Seq.map (fun (k, s) -> s) // seq of (seq of (rowkey, (srcloc, label)))
|> Seq.concat // seq of (rowkey, (srcloc, label))
|> Seq.zip offsets // seq of (dstloc, (rowkey, (srcloc, label)))
|> Seq.map (fun (dst, (rowkey, (src, grp))) ->
(grp, rowkey), (dst, src)) // seq of (label, rowkey), (dstloc, srcloc)
|> ReadOnlyCollection.ofSeq

let addressify (a, b) = (frame.RowIndex.AddressAt <| int64 a, frame.RowIndex.AddressAt <| int64 b)

let keys = ReadOnlyCollection.map fst relocs
let locs = ReadOnlyCollection.map (snd >> addressify) relocs

let newIndex = Index.ofKeys keys
let cmd = VectorConstruction.Relocate(VectorConstruction.Return 0, int64 n, locs)
let newData = frame.Data.Select(VectorHelpers.transformColumn frame.VectorBuilder newIndex.AddressingScheme cmd)
Frame<_, _>(newIndex, frame.ColumnIndex, newData, frame.IndexBuilder, frame.VectorBuilder)
// check if column with the labels has missing values
// NOTE(review): `Seq.length labels` walks `labels` once here and the pipeline below walks it
// again, so a lazily computed `labels` sequence is evaluated twice.
if n <> (Seq.length labels) then
failwith "GroupByLabels: Wrong number of labels. \
Make sure that your frame does not contain missing values."
else
// Pair every label with its source offset and row key, gather rows label by label, then
// number the regrouped rows to obtain their destination offsets.
let relocs =
labels
|> Seq.zip offsets // seq of (srcloc, label)
|> Seq.zip frame.RowKeys // seq of (rowkey, (srcloc, label))
|> Seq.groupBy (fun (rk, (i, l)) -> l) // seq of (label, seq of (rowkey, (srcloc, label)))
|> Seq.map (fun (k, s) -> s) // seq of (seq of (rowkey, (srcloc, label)))
|> Seq.concat // seq of (rowkey, (srcloc, label))
|> Seq.zip offsets // seq of (dstloc, (rowkey, (srcloc, label)))
|> Seq.map (fun (dst, (rowkey, (src, grp))) ->
(grp, rowkey), (dst, src)) // seq of (label, rowkey), (dstloc, srcloc)
|> ReadOnlyCollection.ofSeq

// Turns a pair of integer offsets into a pair of addresses of the current row index.
let addressify (a, b) = (frame.RowIndex.AddressAt <| int64 a, frame.RowIndex.AddressAt <| int64 b)

// New row keys are the (label, rowkey) halves; locations are the (dst, src) halves as addresses.
let keys = ReadOnlyCollection.map fst relocs
let locs = ReadOnlyCollection.map (snd >> addressify) relocs

let newIndex = Index.ofKeys keys
// Relocation command of size n built from the address pairs; presumably it moves each source
// row into its destination slot - confirm against VectorConstruction.Relocate.
let cmd = VectorConstruction.Relocate(VectorConstruction.Return 0, int64 n, locs)
// The same command is applied to every column of the frame's data.
let newData = frame.Data.Select(VectorHelpers.transformColumn frame.VectorBuilder newIndex.AddressingScheme cmd)
Frame<_, _>(newIndex, frame.ColumnIndex, newData, frame.IndexBuilder, frame.VectorBuilder)

member internal frame.NestRowsBy<'TGroup when 'TGroup : equality>(labels:seq<'TGroup>) =
let indexBuilder = frame.IndexBuilder
Expand Down Expand Up @@ -1575,17 +1579,30 @@ and Frame<'TRowKey, 'TColumnKey when 'TRowKey : equality and 'TColumnKey : equal
Series<_, _>(newIndex, Vector.ofValues groups, vectorBuilder, indexBuilder)

/// Groups the rows of the frame by the values of the column `colKey`, read as `'TGroup`.
/// The column's values are handed to `GroupByLabels` as labels together with
/// `frame.RowCount`. The guarded variant below raises via `failwith` when the number
/// of values in the column differs from `frame.RowCount`.
member frame.GroupRowsBy<'TGroup when 'TGroup : equality>(colKey) =
// NOTE(review): rendered diff without +/- markers - the next two lines look like the
// pre-change body and the lines after them like its replacement; confirm against the repository.
let col = frame.GetColumn<'TGroup>(colKey)
frame.GroupByLabels col.Values frame.RowCount
let col = frame.GetColumn<'TGroup>(colKey)
// Presumably `col.Values` omits missing entries, which is what lets the count comparison
// below detect them - TODO confirm against the Series API.
let labels = col.Values
// check if column with labels has missing values
if frame.RowCount <> (Seq.length labels) then
failwith "GroupRowsBy: Specified column contains missing values and \
cannot be used for grouping. Remove missing values \
first (e.g., by using dropSparseRowsBy)."
else
// GroupByLabels repeats the same count comparison; this earlier check exists to raise the
// more specific message above.
frame.GroupByLabels labels frame.RowCount

/// Groups the rows of the frame by a label computed from each row key.
/// `keySelector` is applied to every key of `frame.RowIndex`, and the resulting labels
/// are passed to `GroupByLabels` together with `frame.RowCount`.
member frame.GroupRowsByIndex(keySelector:Func<_, _>) =
// One label per key of the row index, produced through `Seq.map`.
let labels = frame.RowIndex.Keys |> Seq.map keySelector.Invoke
frame.GroupByLabels labels frame.RowCount

/// Groups the rows of the frame by a label computed from each row.
/// `f` receives the row key and the row and returns the `'TGroup` label; the labels are
/// passed to `GroupByLabels` together with `frame.RowCount`. The guarded variant below
/// raises via `failwith` when the number of labels differs from `frame.RowCount`.
member frame.GroupRowsUsing<'TGroup when 'TGroup : equality>(f:System.Func<_, _, 'TGroup>) =
// Presumably `Series.values` leaves out entries for which the projection produced a missing
// value, which is what the count comparison below relies on - TODO confirm.
let labels = frame.Rows |> Series.map (fun k v -> f.Invoke(k, v)) |> Series.values
// NOTE(review): rendered diff without +/- markers - the next line looks like the pre-change
// delegation that the guarded version below replaces; confirm against the repository.
frame.GroupByLabels labels frame.RowCount

// check if column with labels has missing values
if frame.RowCount <> (Seq.length labels) then
failwith "GroupRowsUsing: Generated labels contain missing values and \
cannot be used for grouping. Make sure the projection function does \
not return null or filter out the corresponding rows before grouping."
else
frame.GroupByLabels labels frame.RowCount

/// Returns a data frame whose rows are grouped by `groupBy` and whose columns specified
/// in `aggBy` are aggregated according to `aggFunc`.
///
Expand Down
7 changes: 6 additions & 1 deletion src/Deedle/FrameModule.fs
Original file line number Diff line number Diff line change
Expand Up @@ -1557,7 +1557,12 @@ module Frame =
[<CompiledName("NestBy")>]
let nestBy (keySelector:_ -> 'K1) (frame:Frame<'K2, 'C>) =
// Computes a label for every row key with `keySelector`, groups the rows by those labels
// through `GroupByLabels`, and pipes the grouped frame to `nest`.
let labels = (frame.RowKeys |> Seq.map keySelector)
// NOTE(review): rendered diff without +/- markers - the next line looks like the pre-change
// body that the guarded version below replaces; confirm against the repository.
frame.GroupByLabels labels frame.RowCount |> nest
// NOTE(review): `Seq.map` yields one label per row key, so this comparison can only fail if
// `frame.RowCount` differs from the number of row keys - the branch looks unreachable for a
// keySelector that merely returns null. `Seq.length` also enumerates the mapped sequence, so
// `keySelector` runs once here and again inside GroupByLabels.
if frame.RowCount <> (Seq.length labels) then
failwith "nestBy: Generated labels contain missing values and \
cannot be used for grouping. Make sure the keySelector function does \
not return null."
else
frame.GroupByLabels labels frame.RowCount |> nest

/// Given a series of frames, returns a new data frame with two-level hierarchical
/// row index, using the series keys as the first component. This function is the
Expand Down