diff --git a/changelog.d/4699.bugfix b/changelog.d/4699.bugfix
new file mode 100644
index 0000000000..1d7f3174e7
--- /dev/null
+++ b/changelog.d/4699.bugfix
@@ -0,0 +1 @@
+Fix attempting to paginate in rooms where server cannot see any events, to avoid unnecessarily pulling in lots of redacted events.
diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py
index f80486102a..72b63d64d0 100644
--- a/synapse/handlers/federation.py
+++ b/synapse/handlers/federation.py
@@ -858,6 +858,52 @@ class FederationHandler(BaseHandler):
logger.debug("Not backfilling as no extremeties found.")
return
+ # We only want to paginate if we can actually see the events we'll get,
+ # as otherwise we'll just spend a lot of resources to get redacted
+ # events.
+ #
+ # We do this by filtering all the backwards extremities and seeing if
+ # any remain. Given we don't have the extremity events themselves, we
+ # need to actually check the events that reference them.
+ #
+ # *Note*: the spec wants us to keep backfilling until we reach the start
+ # of the room in case we are allowed to see some of the history. However
+ # in practice that causes more issues than its worth, as a) its
+ # relatively rare for there to be any visible history and b) even when
+ # there is its often sufficiently long ago that clients would stop
+ # attempting to paginate before backfill reached the visible history.
+ #
+ # TODO: If we do do a backfill then we should filter the backwards
+ # extremities to only include those that point to visible portions of
+ # history.
+ #
+ # TODO: Correctly handle the case where we are allowed to see the
+ # forward event but not the backward extremity, e.g. in the case of
+ # initial join of the server where we are allowed to see the join
+ # event but not anything before it. This would require looking at the
+ # state *before* the event, ignoring the special casing certain event
+ # types have.
+
+ forward_events = yield self.store.get_successor_events(
+ list(extremities),
+ )
+
+ extremities_events = yield self.store.get_events(
+ forward_events,
+ check_redacted=False,
+ get_prev_content=False,
+ )
+
+ # We set `check_history_visibility_only` as we might otherwise get false
+ # positives from users having been erased.
+ filtered_extremities = yield filter_events_for_server(
+ self.store, self.server_name, list(extremities_events.values()),
+ redact=False, check_history_visibility_only=True,
+ )
+
+ if not filtered_extremities:
+ defer.returnValue(False)
+
# Check if we reached a point where we should start backfilling.
sorted_extremeties_tuple = sorted(
extremities.items(),
diff --git a/synapse/storage/event_federation.py b/synapse/storage/event_federation.py
index 38809ed0fc..a8d90456e3 100644
--- a/synapse/storage/event_federation.py
+++ b/synapse/storage/event_federation.py
@@ -442,6 +442,28 @@ class EventFederationWorkerStore(EventsWorkerStore, SignatureWorkerStore,
event_results.reverse()
return event_results
+ @defer.inlineCallbacks
+ def get_successor_events(self, event_ids):
+ """Fetch all events that have the given events as a prev event
+
+ Args:
+ event_ids (iterable[str])
+
+ Returns:
+ Deferred[list[str]]
+ """
+ rows = yield self._simple_select_many_batch(
+ table="event_edges",
+ column="prev_event_id",
+ iterable=event_ids,
+ retcols=("event_id",),
+ desc="get_successor_events"
+ )
+
+ defer.returnValue([
+ row["event_id"] for row in rows
+ ])
+
class EventFederationStore(EventFederationWorkerStore):
""" Responsible for storing and serving up the various graphs associated
diff --git a/synapse/visibility.py b/synapse/visibility.py
index 0281a7c919..efec21673b 100644
--- a/synapse/visibility.py
+++ b/synapse/visibility.py
@@ -216,28 +216,36 @@ def filter_events_for_client(store, user_id, events, is_peeking=False,
@defer.inlineCallbacks
-def filter_events_for_server(store, server_name, events):
- # Whatever else we do, we need to check for senders which have requested
- # erasure of their data.
- erased_senders = yield store.are_users_erased(
- (e.sender for e in events),
- )
+def filter_events_for_server(store, server_name, events, redact=True,
+ check_history_visibility_only=False):
+ """Filter a list of events based on whether given server is allowed to
+ see them.
- def redact_disallowed(event, state):
- # if the sender has been gdpr17ed, always return a redacted
- # copy of the event.
- if erased_senders[event.sender]:
+ Args:
+ store (DataStore)
+ server_name (str)
+ events (iterable[FrozenEvent])
+ redact (bool): Whether to return a redacted version of the event, or
+ to filter them out entirely.
+ check_history_visibility_only (bool): Whether to only check the
+ history visibility, rather than things like if the sender has been
+ erased. This is used e.g. during pagination to decide whether to
+ backfill or not.
+
+ Returns
+ Deferred[list[FrozenEvent]]
+ """
+
+ def is_sender_erased(event, erased_senders):
+ if erased_senders and erased_senders[event.sender]:
logger.info(
"Sender of %s has been erased, redacting",
event.event_id,
)
- return prune_event(event)
-
- # state will be None if we decided we didn't need to filter by
- # room membership.
- if not state:
- return event
+ return True
+ return False
+ def check_event_is_visible(event, state):
history = state.get((EventTypes.RoomHistoryVisibility, ''), None)
if history:
visibility = history.content.get("history_visibility", "shared")
@@ -259,17 +267,17 @@ def filter_events_for_server(store, server_name, events):
memtype = ev.membership
if memtype == Membership.JOIN:
- return event
+ return True
elif memtype == Membership.INVITE:
if visibility == "invited":
- return event
+ return True
else:
# server has no users in the room: redact
- return prune_event(event)
+ return False
- return event
+ return True
- # Next lets check to see if all the events have a history visibility
+ # Lets check to see if all the events have a history visibility
# of "shared" or "world_readable". If thats the case then we don't
# need to check membership (as we know the server is in the room).
event_to_state_ids = yield store.get_state_ids_for_events(
@@ -296,16 +304,31 @@ def filter_events_for_server(store, server_name, events):
for e in itervalues(event_map)
)
+ if not check_history_visibility_only:
+ erased_senders = yield store.are_users_erased(
+ (e.sender for e in events),
+ )
+ else:
+ # We don't want to check whether users are erased, which is equivalent
+ # to no users having been erased.
+ erased_senders = {}
+
if all_open:
# all the history_visibility state affecting these events is open, so
# we don't need to filter by membership state. We *do* need to check
# for user erasure, though.
if erased_senders:
- events = [
- redact_disallowed(e, None)
- for e in events
- ]
+ to_return = []
+ for e in events:
+ if not is_sender_erased(e, erased_senders):
+ to_return.append(e)
+ elif redact:
+ to_return.append(prune_event(e))
+
+ defer.returnValue(to_return)
+ # If there are no erased users then we can just return the given list
+ # of events without having to copy it.
defer.returnValue(events)
# Ok, so we're dealing with events that have non-trivial visibility
@@ -361,7 +384,13 @@ def filter_events_for_server(store, server_name, events):
for e_id, key_to_eid in iteritems(event_to_state_ids)
}
- defer.returnValue([
- redact_disallowed(e, event_to_state[e.event_id])
- for e in events
- ])
+ to_return = []
+ for e in events:
+ erased = is_sender_erased(e, erased_senders)
+ visible = check_event_is_visible(e, event_to_state[e.event_id])
+ if visible and not erased:
+ to_return.append(e)
+ elif redact:
+ to_return.append(prune_event(e))
+
+ defer.returnValue(to_return)
|