Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Safe async event cache #13308

Merged
merged 14 commits into from
Jul 19, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/13308.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix async get event cache invalidation logic. Contributed by Nick @ Beeper (@fizzadar).
9 changes: 8 additions & 1 deletion synapse/storage/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ def _attempt_to_invalidate_cache(
cache doesn't exist. Mainly used for invalidating caches on workers,
where they may not have the cache.

Note that this function does not invalidate any remote caches, only the
local in-memory ones. Any remote invalidation must be performed before
calling this.

Args:
cache_name
key: Entry to invalidate. If None then invalidates the entire
Expand All @@ -112,7 +116,10 @@ def _attempt_to_invalidate_cache(
if key is None:
cache.invalidate_all()
else:
cache.invalidate(tuple(key))
# Prefer any local-only invalidation method. Invalidating any non-local
# cache must be done before this.
invalidate_method = getattr(cache, "invalidate_local", cache.invalidate)
invalidate_method(tuple(key))


def db_to_json(db_content: Union[memoryview, bytes, bytearray, str]) -> Any:
Expand Down
40 changes: 31 additions & 9 deletions synapse/storage/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from typing import (
TYPE_CHECKING,
Any,
Awaitable,
Callable,
Collection,
Dict,
Expand All @@ -33,6 +34,7 @@
Tuple,
Type,
TypeVar,
Union,
cast,
overload,
)
Expand All @@ -57,7 +59,7 @@
from synapse.storage.background_updates import BackgroundUpdater
from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine
from synapse.storage.types import Connection, Cursor
from synapse.util.async_helpers import delay_cancellation, maybe_awaitable
from synapse.util.async_helpers import delay_cancellation
from synapse.util.iterutils import batch_iter

if TYPE_CHECKING:
Expand Down Expand Up @@ -208,7 +210,9 @@ def __getattr__(self, name: str) -> Any:


# The type of entry which goes on our after_callbacks and exception_callbacks lists.
_CallbackListEntry = Tuple[Callable[..., object], Tuple[object, ...], Dict[str, object]]
_CallbackListEntry = Tuple[
Callable[..., Union[object, Awaitable]], Tuple[object, ...], Dict[str, object]
]

P = ParamSpec("P")
R = TypeVar("R")
Expand Down Expand Up @@ -796,6 +800,29 @@ async def runInteraction(
The result of func
"""

async def _run_callbacks(callbacks: List[_CallbackListEntry]):
"""
This function takes a list of mixed sync/async callbacks and executes
the async ones first and then the sync callbacks.

We do this with the assumption that async functions call out to external
systems (e.g. to invalidate a cache) and the sync functions make these
changes on any local in-memory caches/similar, and thus must be second.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO this second paragraph belongs as a comment rather than a docstring. It's not really the place of a docstring to describe the reasons behind implementation decisions within a method.

"""

sync_callbacks: List[_CallbackListEntry] = []

for cb, args, kwargs in callbacks:
if inspect.iscoroutinefunction(cb):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a good reason to limit this to coroutine functions, rather than other awaitables? I think inspect.isawaitable(cb) would be better?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's what I did first but isawaitable is only true on the return value of the function (isawaitable(cb())) rather than the function def itself:

In [1]: async def function():
   ...:     pass

In [2]: class Test:
   ...:     async def method():
   ...:         pass
 
In [4]: inspect.isawaitable(function)
Out[4]: False
In [5]: inspect.isawaitable(Test.method)
Out[5]: False
In [6]: inspect.iscoroutine(function)
Out[6]: False
In [7]: inspect.iscoroutine(Test.method)
Out[7]: False
In [8]: inspect.iscoroutinefunction(function)
Out[8]: True
In [9]: inspect.iscoroutinefunction(Test.method)
Out[9]: True

Perhaps this should be changed to also check awaitables?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh sorry, brainfart on my part. As you say: coroutine functions aren't themselves awaitable; they return an awaitable the first time you call them.

Unfortunately, they aren't the only thing that can return awaitables (eg: old-style functions that just return a Twisted Deferred object) and we need to do the same thing with other such functions. And you can't tell whether that is the case (other than via typing annotations) until you call the function in question and see what you get back - which of course doesn't work here.

So I think the only correct way to do this is to have separate txn.async_call_after which updates a separate list of async callbacks.

awaitable = cb(*args, **kwargs)
assert isinstance(awaitable, Awaitable)
await awaitable
else:
sync_callbacks.append((cb, args, kwargs))

for cb, args, kwargs in sync_callbacks:
cb(*args, **kwargs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add some comments here please? What is going on here, and why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a docstring to describe it - should this be hoisted to a class method now perhaps? b8ea213

Copy link
Member

@richvdh richvdh Jul 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be hoisted to a class method now perhaps?

or, better, a top-level function. Though I suspect the changes regarding async_call_after mean this function isn't going to be useful any more.


async def _runInteraction() -> R:
after_callbacks: List[_CallbackListEntry] = []
exception_callbacks: List[_CallbackListEntry] = []
Expand All @@ -817,15 +844,10 @@ async def _runInteraction() -> R:
**kwargs,
)

for after_callback, after_args, after_kwargs in after_callbacks:
await maybe_awaitable(after_callback(*after_args, **after_kwargs))

await _run_callbacks(after_callbacks)
return cast(R, result)
except Exception:
for exception_callback, after_args, after_kwargs in exception_callbacks:
await maybe_awaitable(
exception_callback(*after_args, **after_kwargs)
)
await _run_callbacks(exception_callbacks)
raise

# To handle cancellation, we ensure that `after_callback`s and
Expand Down
2 changes: 1 addition & 1 deletion synapse/storage/databases/main/censor_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def delete_expired_event_txn(txn: LoggingTransaction) -> None:
# changed its content in the database. We can't call
# self._invalidate_cache_and_stream because self.get_event_cache isn't of the
# right type.
txn.call_after(self._get_event_cache.invalidate, (event.event_id,))
self.invalidate_get_event_cache_after_txn(txn, event.event_id)
# Send that invalidation to replication so that other workers also invalidate
# the event cache.
self._send_invalidation_to_replication(
Expand Down
4 changes: 2 additions & 2 deletions synapse/storage/databases/main/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -1293,7 +1293,7 @@ def _update_room_depths_txn(
depth_updates: Dict[str, int] = {}
for event, context in events_and_contexts:
# Remove any existing cache entries for the event_ids
txn.call_after(self.store._invalidate_get_event_cache, event.event_id)
self.store.invalidate_get_event_cache_after_txn(txn, event.event_id)
# Then update the `stream_ordering` position to mark the latest
# event as the front of the room. This should not be done for
# backfilled events because backfilled events have negative
Expand Down Expand Up @@ -1684,7 +1684,7 @@ def _store_redaction(self, txn: LoggingTransaction, event: EventBase) -> None:
_invalidate_caches_for_event.
"""
assert event.redacts is not None
txn.call_after(self.store._invalidate_get_event_cache, event.redacts)
self.store.invalidate_get_event_cache_after_txn(txn, event.redacts)
txn.call_after(self.store.get_relations_for_event.invalidate, (event.redacts,))
txn.call_after(self.store.get_applicable_edit.invalidate, (event.redacts,))

Expand Down
40 changes: 32 additions & 8 deletions synapse/storage/databases/main/events_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,17 +712,41 @@ async def get_missing_events_from_db() -> Dict[str, EventCacheEntry]:

return event_entry_map

async def _invalidate_get_event_cache(self, event_id: str) -> None:
# First we invalidate the asynchronous cache instance. This may include
# out-of-process caches such as Redis/memcache. Once complete we can
# invalidate any in memory cache. The ordering is important here to
# ensure we don't pull in any remote invalid value after we invalidate
# the in-memory cache.
def invalidate_get_event_cache_after_txn(
    self, txn: LoggingTransaction, event_id: str
) -> None:
    """
    Arrange for the get-event caches to be invalidated once `txn` completes
    successfully.

    Two `call_after` hooks are registered on the transaction: first the
    (possibly remote) async cache invalidation, then the local in-memory
    one. The ordering matters — the async cache must be cleared before the
    local copy so a stale remote value cannot be pulled back in afterwards.

    Arguments:
        txn: the database transaction to attach the callbacks to
        event_id: the event ID to be invalidated from caches
    """
    # Registration order is execution order: async (remote) first.
    for invalidator in (
        self._invalidate_async_get_event_cache,
        self._invalidate_local_get_event_cache,
    ):
        txn.call_after(invalidator, event_id)

async def _invalidate_async_get_event_cache(self, event_id: str) -> None:
"""
Invalidates an event in the asyncronous get event cache, which may be remote.

Arguments:
event_id: the event ID to invalidate
"""

await self._get_event_cache.invalidate((event_id,))
self._event_ref.pop(event_id, None)
self._current_event_fetches.pop(event_id, None)

def _invalidate_local_get_event_cache(self, event_id: str) -> None:
"""
Invalidates an event in local in-memory get event caches.

Arguments:
event_id: the event ID to invalidate
"""

self._get_event_cache.invalidate_local((event_id,))
self._event_ref.pop(event_id, None)
self._current_event_fetches.pop(event_id, None)
Expand Down
2 changes: 1 addition & 1 deletion synapse/storage/databases/main/purge_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def _purge_history_txn(
self._invalidate_cache_and_stream(
txn, self.have_seen_event, (room_id, event_id)
)
txn.call_after(self._invalidate_get_event_cache, event_id)
self.invalidate_get_event_cache_after_txn(txn, event_id)

logger.info("[purge] done")

Expand Down