From b360d87ad02383e37ad53dafa036abfb9e2bbc53 Mon Sep 17 00:00:00 2001 From: Michael Maier Date: Sun, 26 Nov 2023 13:12:36 +0100 Subject: [PATCH] fix: preserve the order of the keys (#211) * fix: preserve the order of the keys Co-authored-by: Rodolfo Carvalho --- README.md | 92 ++++++++- lib/bson/decoder.ex | 391 +++++++++++++++++++++---------------- lib/mongo/messages.ex | 56 ++++-- mix.lock | 1 + test/bson/decoder_test.exs | 115 +++++++++++ 5 files changed, 456 insertions(+), 199 deletions(-) create mode 100644 test/bson/decoder_test.exs diff --git a/README.md b/README.md index d6baf0c7..c6da0e2b 100644 --- a/README.md +++ b/README.md @@ -135,15 +135,18 @@ Mongo.insert_many(top, "users", [ ## Data Representation -This driver chooses to accept both maps and lists of key-value tuples when encoding BSON documents (1), but will only decode documents into maps. This has the side effect that document field order is lost when decoding. Maps are convenient to work with, but map keys are not ordered, unlike BSON document fields. +This driver chooses to accept both maps and lists of key-value tuples when encoding BSON documents (1), but will only +decode documents into maps. Maps are convenient to work with, but Elixir map keys are not ordered, unlike BSON document +keys. -Driver users should represent documents using a list of tuples when field order matters, for example when sorting by multiple fields: +That design decision means document key order is lost when encoding Elixir maps to BSON and, conversely, when decoding +BSON documents to Elixir maps. However, see [Preserve Document Key Order](#preserve-document-key-order) to learn how to +preserve key order when it matters. -```elixir -Mongo.find(top, "users", %{}, sort: [last_name: 1, first_name: 1, _id: 1]) -``` - -Additionally, the driver accepts both atoms and strings for document keys, but will only decode them into strings. Creating atoms from arbitrary input (such as database documents) is [discouraged](https://elixir-lang.org/getting-started/mix-otp/genserver.html#:~:text=However%2C%20naming%20dynamic,our%20system%20memory!) because atoms are not garbage collected. +Additionally, the driver accepts both atoms and strings for document keys, but will only decode them into strings. +Creating atoms from arbitrary input (such as database documents) is +[discouraged](https://elixir-lang.org/getting-started/mix-otp/genserver.html#:~:text=However%2C%20naming%20dynamic,our%20system%20memory!) +because atoms are not garbage collected. [BSON symbols (deprecated)](https://bsonspec.org/spec.html#:~:text=Symbol.%20%E2%80%94%20Deprecated) can only be decoded (2). @@ -169,6 +172,81 @@ Additionally, the driver accepts both atoms and strings for document keys, but w max key :BSON_max decimal128 Decimal{} +## Preserve Document Key Order + +### Encoding from Elixir to BSON + +For some MongoDB operations, the order of the keys in a document affect the result. For example, that is the case when +sorting a query by multiple fields. + +In those cases, driver users should represent documents using a list of tuples (or a keyword list) to preserve the +order. Example: + +```elixir +Mongo.find(top, "users", %{}, sort: [last_name: 1, first_name: 1, _id: 1]) +``` + +The query above will sort users by last name, then by first name and finally by ID. If an Elixir map had been used to +specify `:sort`, query results would end up sorted unexpectedly wrong. + +### Decoding from BSON to Elixir + +Decoded BSON documents are always represented by Elixir maps because the driver depends on that to implement its +functionality. + +If the order of document keys as stored by MongoDB is needed, the driver can be configured to use a BSON decoder module +that puts a list of keys in the original order under the `:__order__` key (and it works recursively). + +```elixir +config :mongodb_driver, + decoder: BSON.PreserveOrderDecoder +``` + +It is possible to customize the key. For example, to use `:original_order` instead of the default `:__order__`: + +```elixir +config :mongodb_driver, + decoder: {BSON.PreserveOrderDecoder, key: :original_order} +``` + +The resulting maps with annotated key order can be recursively transformed into lists of tuples. That allows for +preserving the order again when encoding. Here is an example of how to achieve that: + +```elixir +defmodule MapWithOrder do + def to_list(doc, order_key \\ :__order__) do + do_to_list(doc, order_key) + end + + defp do_to_list(%{__struct__: _} = elem, _order_key) do + elem + end + + defp do_to_list(doc, order_key) when is_map(doc) do + doc + |> Map.get(order_key, Map.keys(doc)) + |> Enum.map(fn key -> {key, do_to_list(Map.get(doc, key), order_key)} end) + end + + defp do_to_list(xs, order_key) when is_list(xs) do + Enum.map(xs, fn elem -> do_to_list(elem, order_key) end) + end + + defp do_to_list(elem, _order_key) do + elem + end +end + +# doc = ... +MapWithOrder.to_list(doc) +``` + +Note that structs are kept as-is, to handle special values such as `BSON.ObjectId`. + +The decoder module is defined at compile time. The default decoder is `BSON.Decoder`, which does not preserve document +key order. As it needs to execute fewer operations when decoding data received from MongoDB, it offers improved +performance. Therefore, the default decoder is recommended for most use cases of this driver. + ## Writing your own encoding info If you want to write a custom struct to your mongo collection - you can do that diff --git a/lib/bson/decoder.ex b/lib/bson/decoder.ex index 36beba3f..5e501204 100644 --- a/lib/bson/decoder.ex +++ b/lib/bson/decoder.ex @@ -1,183 +1,228 @@ -defmodule BSON.Decoder do +defmodule BSON.DecoderGenerator do @moduledoc false - use BSON.Utils - alias BSON.Decimal128 - - def decode(binary) do - {map, ""} = document(binary) - map - end - - def documents(binary), do: documents(binary, []) - def documents("", acc), do: Enum.reverse(acc) - - def documents(binary, acc) do - {doc, rest} = document(binary) - documents(rest, [doc | acc]) - end - - defp type(@type_float, <<0, 0, 0, 0, 0, 0, 240::little-integer-size(8), 127::little-integer-size(8), rest::binary>>) do - {:inf, rest} - end - - defp type(@type_float, <<0, 0, 0, 0, 0, 0, 240::little-integer-size(8), 255::little-integer-size(8), rest::binary>>) do - {:"-inf", rest} - end - - defp type(@type_float, <<0, 0, 0, 0, 0, 0, 248::little-integer-size(8), 127::little-integer-size(8), rest::binary>>) do - {:NaN, rest} - end - - defp type(@type_float, <<1, 0, 0, 0, 0, 0, 240::little-integer-size(8), 127::little-integer-size(8), rest::binary>>) do - {:NaN, rest} - end - - defp type(@type_float, <>) do - {float, rest} - end - - defp type(@type_string, <>) do - size = size - 1 - <> = rest - {string, rest} - end - - defp type(@type_document, binary) do - document(binary) - end - - defp type(@type_array, binary) do - list(binary) - end - - defp type(@type_binary, <<_size::int32(), subtype, length::int32(), binary::binary(length), rest::binary>>) when subtype == 0x02 do - subtype = subtype(subtype) - {%BSON.Binary{binary: binary, subtype: subtype}, rest} - end - - defp type(@type_binary, <>) do - subtype = subtype(subtype) - {%BSON.Binary{binary: binary, subtype: subtype}, rest} - end - - defp type(@type_objectid, <>) do - {%BSON.ObjectId{value: binary}, rest} - end - - defp type(@type_bool, <<0x00, rest::binary>>) do - {false, rest} - end - - defp type(@type_bool, <<0x01, rest::binary>>) do - {true, rest} - end - - defp type(@type_datetime, <>) do - {DateTime.from_unix!(unix_ms, :millisecond), rest} - end - - defp type(@type_undefined, rest) do - {nil, rest} - end - - defp type(@type_null, rest) do - {nil, rest} - end - - defp type(@type_regex, binary) do - {pattern, rest} = cstring(binary) - {options, rest} = cstring(rest) - {%BSON.Regex{pattern: pattern, options: options}, rest} - end - - defp type(@type_js, binary) do - {code, rest} = type(@type_string, binary) - {%BSON.JavaScript{code: code}, rest} - end - - defp type(@type_symbol, binary) do - type(@type_string, binary) - end - - defp type(@type_js_scope, <>) do - size = size - 4 - <> = binary - {code, binary} = type(@type_string, binary) - {scope, ""} = document(binary) - {%BSON.JavaScript{code: code, scope: scope}, rest} - end - - defp type(@type_int32, <>) do - {int, rest} - end - - defp type(@type_timestamp, <>) do - {%BSON.Timestamp{value: epoch, ordinal: ordinal}, rest} - end - - defp type(@type_int64, <>) do - {int, rest} - end - - defp type(@type_decimal128, <>) do - {Decimal128.decode(bits), rest} - end - - defp type(@type_min, rest) do - {:BSON_min, rest} - end - defp type(@type_max, rest) do - {:BSON_max, rest} + defmacro __using__(opts) do + quote bind_quoted: [opts: opts] do + use BSON.Utils + alias BSON.Decimal128 + + @preserve_order opts[:preserve_order] || false + @compile {:inline, cstring: 1} + + def decode(binary) do + {map, ""} = document(binary) + map + end + + def documents(binary) do + documents(binary, []) + end + + def documents("", acc) do + Enum.reverse(acc) + end + + def documents(binary, acc) do + {doc, rest} = document(binary) + documents(rest, [doc | acc]) + end + + def document(<>) do + size = size - 5 + <> = rest + + {doc_fields(doc, []), rest} + end + + defp doc_fields(<>, acc) do + {key, rest} = cstring(rest) + {value, rest} = type(type, rest) + + doc_fields(rest, [{key, value} | acc]) + end + + if @preserve_order == false do + defp doc_fields("", acc) do + Map.new(acc) + end + else + defp doc_fields("", acc) do + acc + |> Map.new() + |> Map.put(@preserve_order, Enum.map(acc, fn {key, _value} -> key end) |> Enum.reverse()) + end + end + + defp list(<>) do + size = size - 5 + <> = rest + + {list_elems(list, []), rest} + end + + defp list_elems(<>, acc) do + {_ignored, rest} = cstring(rest) + {value, rest} = type(type, rest) + + list_elems(rest, [value | acc]) + end + + defp list_elems("", acc) do + Enum.reverse(acc) + end + + defp cstring(binary) do + split(binary, []) + end + + defp split(<<0x00, rest::binary>>, acc) do + {acc |> Enum.reverse() |> :binary.list_to_bin(), rest} + end + + defp split(<>, acc) do + split(rest, [byte | acc]) + end + + defp subtype(0x00), do: :generic + defp subtype(0x01), do: :function + defp subtype(0x02), do: :binary_old + defp subtype(0x03), do: :uuid_old + defp subtype(0x04), do: :uuid + defp subtype(0x05), do: :md5 + defp subtype(int) when is_integer(int) and int in 0x80..0xFF, do: int + + defp type(@type_string, <>) do + size = size - 1 + <> = rest + {string, rest} + end + + defp type(@type_document, binary) do + document(binary) + end + + defp type(@type_array, binary) do + list(binary) + end + + defp type(@type_binary, <<_size::int32(), subtype, length::int32(), binary::binary(length), rest::binary>>) when subtype == 0x02 do + subtype = subtype(subtype) + {%BSON.Binary{binary: binary, subtype: subtype}, rest} + end + + defp type(@type_binary, <>) do + subtype = subtype(subtype) + {%BSON.Binary{binary: binary, subtype: subtype}, rest} + end + + defp type(@type_objectid, <>) do + {%BSON.ObjectId{value: binary}, rest} + end + + defp type(@type_bool, <<0x00, rest::binary>>) do + {false, rest} + end + + defp type(@type_bool, <<0x01, rest::binary>>) do + {true, rest} + end + + defp type(@type_datetime, <>) do + {DateTime.from_unix!(unix_ms, :millisecond), rest} + end + + defp type(@type_undefined, rest) do + {nil, rest} + end + + defp type(@type_null, rest) do + {nil, rest} + end + + defp type(@type_regex, binary) do + {pattern, rest} = cstring(binary) + {options, rest} = cstring(rest) + {%BSON.Regex{pattern: pattern, options: options}, rest} + end + + defp type(@type_js, binary) do + {code, rest} = type(@type_string, binary) + {%BSON.JavaScript{code: code}, rest} + end + + defp type(@type_symbol, binary) do + type(@type_string, binary) + end + + defp type(@type_js_scope, <>) do + size = size - 4 + <> = binary + {code, binary} = type(@type_string, binary) + {scope, ""} = document(binary) + {%BSON.JavaScript{code: code, scope: scope}, rest} + end + + defp type(@type_int32, <>) do + {int, rest} + end + + defp type(@type_timestamp, <>) do + {%BSON.Timestamp{value: epoch, ordinal: ordinal}, rest} + end + + defp type(@type_int64, <>) do + {int, rest} + end + + defp type(@type_decimal128, <>) do + {Decimal128.decode(bits), rest} + end + + defp type(@type_float, <<0, 0, 0, 0, 0, 0, 240::little-integer-size(8), 127::little-integer-size(8), rest::binary>>) do + {:inf, rest} + end + + defp type(@type_float, <<0, 0, 0, 0, 0, 0, 240::little-integer-size(8), 255::little-integer-size(8), rest::binary>>) do + {:"-inf", rest} + end + + defp type(@type_float, <<0, 0, 0, 0, 0, 0, 248::little-integer-size(8), 127::little-integer-size(8), rest::binary>>) do + {:NaN, rest} + end + + defp type(@type_float, <<1, 0, 0, 0, 0, 0, 240::little-integer-size(8), 127::little-integer-size(8), rest::binary>>) do + {:NaN, rest} + end + + defp type(@type_float, <>) do + {float, rest} + end + + defp type(@type_min, rest) do + {:BSON_min, rest} + end + + defp type(@type_max, rest) do + {:BSON_max, rest} + end + end end +end - def document(<>) do - size = size - 5 - <> = rest - - {doc_fields(doc, []), rest} - end - - defp doc_fields(<>, acc) do - {key, rest} = cstring(rest) - {value, rest} = type(type, rest) - - doc_fields(rest, [{key, value} | acc]) - end - - defp doc_fields("", acc) do - acc |> Enum.reverse() |> Enum.into(%{}) - end - - defp list(<>) do - size = size - 5 - <> = rest - - {list_elems(list, 0, []), rest} - end +defmodule BSON.Decoder do + # This module provides functions for decoding BSON data into Elixir values. + # The data type conversions are documented at https://hexdocs.pm/mongodb_driver/readme.html#data-representation. - defp list_elems(<>, ix, acc) do - ix_string = Integer.to_string(ix) - {^ix_string, rest} = cstring(rest) - {value, rest} = type(type, rest) + @moduledoc false - list_elems(rest, ix + 1, [value | acc]) - end + use BSON.DecoderGenerator, preserve_order: false +end - defp list_elems("", _ix, acc) do - acc |> Enum.reverse() - end +defmodule BSON.PreserveOrderDecoder do + # This module is like `BSON.Decoder`, but it retains the original order of + # document keys in a list. - defp cstring(binary) do - [string, rest] = :binary.split(binary, <<0x00>>) - {string, rest} - end + @moduledoc false - defp subtype(0x00), do: :generic - defp subtype(0x01), do: :function - defp subtype(0x02), do: :binary_old - defp subtype(0x03), do: :uuid_old - defp subtype(0x04), do: :uuid - defp subtype(0x05), do: :md5 - defp subtype(int) when is_integer(int) and int in 0x80..0xFF, do: int + use BSON.DecoderGenerator, preserve_order: :__order__ end diff --git a/lib/mongo/messages.ex b/lib/mongo/messages.ex index 8a279411..b5ed10df 100644 --- a/lib/mongo/messages.ex +++ b/lib/mongo/messages.ex @@ -47,6 +47,8 @@ defmodule Mongo.Messages do defrecord :section, [:payload_type, :payload] defrecord :op_msg, [:flags, :sections] + @decoder_module Application.compile_env(:mongodb_driver, :decoder, BSON.Decoder) + @doc """ Decodes the header from response of a request sent by the mongodb server """ @@ -72,44 +74,53 @@ defmodule Mongo.Messages do @doc """ Decodes the response body of a request sent by the mongodb server """ - def decode_response(msg_header(length: length) = header, iolist) when is_list(iolist) do - case IO.iodata_length(iolist) >= length do - true -> decode_response(header, IO.iodata_to_binary(iolist)) - false -> :error - end - end - - def decode_response(msg_header(length: length, response_to: response_to, op_code: op_code), binary) when byte_size(binary) >= length do + def decode_response(msg_header(length: length, response_to: response_to, op_code: op_code), binary) when is_binary(binary) and byte_size(binary) >= length do <> = binary case op_code do - @op_reply -> {:ok, response_to, decode_reply(response), rest} - @op_msg_code -> {:ok, response_to, decode_msg(response), rest} - _ -> :error + @op_reply -> + {:ok, response_to, decode_reply(response), rest} + + @op_msg_code -> + {:ok, response_to, decode_msg(response), rest} + + _ -> + :error end end - def decode_response(_header, _binary), do: :error + def decode_response(header, iolist) when is_list(iolist) do + decode_response(header, IO.iodata_to_binary(iolist)) + end + + def decode_response(_header, _binary) do + :error + end @doc """ Decodes a reply message from the response """ def decode_reply(<>) do - op_reply(flags: flags, cursor_id: cursor_id, from: from, num: num, docs: BSON.Decoder.documents(rest)) + op_reply(flags: flags, cursor_id: cursor_id, from: from, num: num, docs: @decoder_module.documents(rest)) end def decode_msg(<>) do op_msg(flags: flags, sections: decode_sections(rest)) end - def decode_sections(binary), do: decode_sections(binary, []) - def decode_sections("", acc), do: Enum.reverse(acc) + def decode_sections(binary) do + decode_sections(binary, []) + end + + def decode_sections("", acc) do + Enum.reverse(acc) + end def decode_sections(<<0x00::int8(), payload::binary>>, acc) do <> = payload <> = payload - with {doc, ""} <- BSON.Decoder.document(doc) do + with {doc, ""} <- @decoder_module.document(doc) do decode_sections(rest, [section(payload_type: 0, payload: payload(doc: doc)) | acc]) end end @@ -122,13 +133,20 @@ defmodule Mongo.Messages do def decode_sequence(<>) do with {identifier, docs} <- cstring(rest) do - sequence(size: size, identifier: identifier, docs: BSON.Decoder.documents(docs)) + sequence(size: size, identifier: identifier, docs: @decoder_module.documents(docs)) end end defp cstring(binary) do - [string, rest] = :binary.split(binary, <<0x00>>) - {string, rest} + split(binary, []) + end + + defp split(<<0x00, rest::binary>>, acc) do + {acc |> Enum.reverse() |> :binary.list_to_bin(), rest} + end + + defp split(<>, acc) do + split(rest, [byte | acc]) end def encode(request_id, op_query() = op) do diff --git a/mix.lock b/mix.lock index e2298812..e0a18ba7 100644 --- a/mix.lock +++ b/mix.lock @@ -28,6 +28,7 @@ "patch": {:hex, :patch, "0.12.0", "2da8967d382bade20344a3e89d618bfba563b12d4ac93955468e830777f816b0", [:mix], [], "hexpm", "ffd0e9a7f2ad5054f37af84067ee88b1ad337308a1cb227e181e3967127b0235"}, "poolboy": {:hex, :poolboy, "1.5.2", "392b007a1693a64540cead79830443abf5762f5d30cf50bc95cb2c1aaafa006b", [:rebar3], [], "hexpm"}, "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.5", "6eaf7ad16cb568bb01753dbbd7a95ff8b91c7979482b95f38443fe2c8852a79b", [:make, :mix, :rebar3], [], "hexpm", "13104d7897e38ed7f044c4de953a6c28597d1c952075eb2e328bc6d6f2bfc496"}, + "statistex": {:hex, :statistex, "1.0.0", "f3dc93f3c0c6c92e5f291704cf62b99b553253d7969e9a5fa713e5481cd858a5", [:mix], [], "hexpm", "ff9d8bee7035028ab4742ff52fc80a2aa35cece833cf5319009b52f1b5a86c27"}, "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"}, } diff --git a/test/bson/decoder_test.exs b/test/bson/decoder_test.exs new file mode 100644 index 00000000..d79fa07f --- /dev/null +++ b/test/bson/decoder_test.exs @@ -0,0 +1,115 @@ +defmodule BSON.DecoderTest.CustomPreserveOrderDecoder do + use BSON.DecoderGenerator, preserve_order: :original_order +end + +defmodule BSON.DecoderTest.MapWithOrder do + def to_list(doc, order_key \\ :__order__) do + do_to_list(doc, order_key) + end + + defp do_to_list(%{__struct__: _} = elem, _order_key) do + elem + end + + defp do_to_list(doc, order_key) when is_map(doc) do + doc + |> Map.get(order_key, Map.keys(doc)) + |> Enum.map(fn key -> {key, do_to_list(Map.get(doc, key), order_key)} end) + end + + defp do_to_list(xs, order_key) when is_list(xs) do + Enum.map(xs, fn elem -> do_to_list(elem, order_key) end) + end + + defp do_to_list(elem, _order_key) do + elem + end +end + +defmodule BSON.DecoderTest do + use ExUnit.Case, async: true + + # { + # "key1": { + # "a": 1, + # "b": 2, + # "c": 3 + # }, + # "key2": { + # "x": 4, + # "y": 5 + # } + # } + @bson_document <<62, 0, 0, 0, 3, 107, 101, 121, 49, 0, 26, 0, 0, 0, 16, 97, 0, 1, 0, 0, 0, 16, 98, 0, 2, 0, 0, 0, 16, 99, 0, 3, 0, 0, 0, 0, 3, 107, 101, 121, 50, 0, 19, 0, 0, 0, 16, 120, 0, 4, 0, 0, 0, 16, 121, 0, 5, 0, 0, 0, 0, 0>> + + describe "BSON.Decoder.decode/1" do + test "decodes binary data into a map" do + assert BSON.Decoder.decode(@bson_document) == %{ + "key1" => %{ + "a" => 1, + "b" => 2, + "c" => 3 + }, + "key2" => %{ + "x" => 4, + "y" => 5 + } + } + end + end + + describe "BSON.PreserveOrderDecoder.decode/1" do + test "decodes binary data into a map with :__order__" do + assert BSON.PreserveOrderDecoder.decode(@bson_document) == %{ + "key1" => %{ + "a" => 1, + "b" => 2, + "c" => 3, + __order__: ["a", "b", "c"] + }, + "key2" => %{ + "x" => 4, + "y" => 5, + __order__: ["x", "y"] + }, + __order__: ["key1", "key2"] + } + end + + test "decodes binary data into a map with custom key" do + assert BSON.DecoderTest.CustomPreserveOrderDecoder.decode(@bson_document) == %{ + "key1" => %{ + "a" => 1, + "b" => 2, + "c" => 3, + original_order: ["a", "b", "c"] + }, + "key2" => %{ + "x" => 4, + "y" => 5, + original_order: ["x", "y"] + }, + original_order: ["key1", "key2"] + } + end + end + + test "annotated maps can be converted to lists" do + ordered_list = + %{ + "_id" => BSON.ObjectId.new(1, 2, 3, 4), + "user" => %{ + "name" => "John Doe", + "age" => 42, + __order__: ["name", "age"] + }, + __order__: ["_id", "user"] + } + |> BSON.DecoderTest.MapWithOrder.to_list() + + assert ordered_list == [ + {"_id", BSON.ObjectId.new(1, 2, 3, 4)}, + {"user", [{"name", "John Doe"}, {"age", 42}]} + ] + end +end