diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py
index 57ad6e5dce..d92582fd5c 100644
--- a/synapse/handlers/federation.py
+++ b/synapse/handlers/federation.py
@@ -32,18 +32,20 @@ from typing import (
)
import attr
+from prometheus_client import Histogram
from signedjson.key import decode_verify_key_bytes
from signedjson.sign import verify_signed_json
from unpaddedbase64 import decode_base64
from synapse import event_auth
-from synapse.api.constants import EventContentFields, EventTypes, Membership
+from synapse.api.constants import MAX_DEPTH, EventContentFields, EventTypes, Membership
from synapse.api.errors import (
AuthError,
CodeMessageException,
Codes,
FederationDeniedError,
FederationError,
+ FederationPullAttemptBackoffError,
HttpResponseException,
LimitExceededError,
NotFoundError,
@@ -59,6 +61,7 @@ from synapse.events.validator import EventValidator
from synapse.federation.federation_client import InvalidResponseError
from synapse.http.servlet import assert_params_in_dict
from synapse.logging.context import nested_logging_context
+from synapse.logging.opentracing import SynapseTags, set_tag, tag_args, trace
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.module_api import NOT_SPAM
from synapse.replication.http.federation import (
@@ -68,7 +71,7 @@ from synapse.replication.http.federation import (
from synapse.storage.databases.main.events import PartialStateConflictError
from synapse.storage.databases.main.events_worker import EventRedactBehaviour
from synapse.storage.state import StateFilter
-from synapse.types import JsonDict, StateMap, get_domain_from_id
+from synapse.types import JsonDict, get_domain_from_id
from synapse.util.async_helpers import Linearizer
from synapse.util.retryutils import NotRetryingDestination
from synapse.visibility import filter_events_for_server
@@ -78,36 +81,28 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
-
-def get_domains_from_state(state: StateMap[EventBase]) -> List[Tuple[str, int]]:
- """Get joined domains from state
-
- Args:
- state: State map from type/state key to event.
-
- Returns:
- Returns a list of servers with the lowest depth of their joins.
- Sorted by lowest depth first.
- """
- joined_users = [
- (state_key, int(event.depth))
- for (e_type, state_key), event in state.items()
- if e_type == EventTypes.Member and event.membership == Membership.JOIN
- ]
-
- joined_domains: Dict[str, int] = {}
- for u, d in joined_users:
- try:
- dom = get_domain_from_id(u)
- old_d = joined_domains.get(dom)
- if old_d:
- joined_domains[dom] = min(d, old_d)
- else:
- joined_domains[dom] = d
- except Exception:
- pass
-
- return sorted(joined_domains.items(), key=lambda d: d[1])
+# Added to debug performance and track progress on optimizations
+backfill_processing_before_timer = Histogram(
+ "synapse_federation_backfill_processing_before_time_seconds",
+ "sec",
+ [],
+ buckets=(
+ 0.1,
+ 0.5,
+ 1.0,
+ 2.5,
+ 5.0,
+ 7.5,
+ 10.0,
+ 15.0,
+ 20.0,
+ 30.0,
+ 40.0,
+ 60.0,
+ 80.0,
+ "+Inf",
+ ),
+)
class _BackfillPointType(Enum):
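(Aside: a minimal, standalone sketch of how a `prometheus_client` Histogram like the one added above accumulates timing observations; the metric name, buckets, and workload below are illustrative only, not Synapse code.)

import time

from prometheus_client import Histogram

example_timer = Histogram(
    "example_processing_time_seconds",
    "Time spent processing, in seconds",
    buckets=(0.1, 0.5, 1.0, 5.0, 10.0, "+Inf"),
)

def do_work() -> None:
    start = time.monotonic()
    try:
        time.sleep(0.2)  # stand-in for the real processing
    finally:
        # Each observation is counted in every bucket whose upper bound is
        # >= the observed value; that is what the exported `le` labels mean.
        example_timer.observe(time.monotonic() - start)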
@@ -137,6 +132,7 @@ class FederationHandler:
def __init__(self, hs: "HomeServer"):
self.hs = hs
+ self.clock = hs.get_clock()
self.store = hs.get_datastores().main
self._storage_controllers = hs.get_storage_controllers()
self._state_storage_controller = self._storage_controllers.state
@@ -154,6 +150,8 @@ class FederationHandler:
self.http_client = hs.get_proxied_blacklisted_http_client()
self._replication = hs.get_replication_data_handler()
self._federation_event_handler = hs.get_federation_event_handler()
+ self._device_handler = hs.get_device_handler()
+ self._bulk_push_rule_evaluator = hs.get_bulk_push_rule_evaluator()
self._clean_room_for_join_client = ReplicationCleanRoomRestServlet.make_client(
hs
@@ -180,6 +178,7 @@ class FederationHandler:
"resume_sync_partial_state_room", self._resume_sync_partial_state_room
)
+ @trace
async def maybe_backfill(
self, room_id: str, current_depth: int, limit: int
) -> bool:
@@ -195,16 +194,52 @@ class FederationHandler:
return. This is used as part of the heuristic to decide if we
should back paginate.
"""
+ # Start the processing timer here so we can include time spent waiting in
+ # the room backfill linearizer lock queue in the timing
+ processing_start_time = self.clock.time_msec()
+
async with self._room_backfill.queue(room_id):
- return await self._maybe_backfill_inner(room_id, current_depth, limit)
+ return await self._maybe_backfill_inner(
+ room_id,
+ current_depth,
+ limit,
+ processing_start_time=processing_start_time,
+ )
async def _maybe_backfill_inner(
- self, room_id: str, current_depth: int, limit: int
+ self,
+ room_id: str,
+ current_depth: int,
+ limit: int,
+ *,
+ processing_start_time: Optional[int],
) -> bool:
+ """
+ Checks whether the `current_depth` is at or approaching any backfill
+ points in the room and, if so, backfills. We only care about backfill
+ points that happened before the `current_depth` (meaning less than or
+ equal to the `current_depth`).
+
+ Args:
+ room_id: The room to backfill in.
+ current_depth: The depth to check at for any upcoming backfill points.
+ limit: The max number of events to request from the remote federated server.
+ processing_start_time: The time when `maybe_backfill` started processing.
+ Only used for timing. If `None`, no timing observation will be made.
+ """
backwards_extremities = [
_BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY)
- for event_id, depth in await self.store.get_oldest_event_ids_with_depth_in_room(
- room_id
+ for event_id, depth in await self.store.get_backfill_points_in_room(
+ room_id=room_id,
+ current_depth=current_depth,
+ # We only need to end up with 5 extremities combined with the
+ # insertion event extremities to make the `/backfill` request,
+ # but fetch an order of magnitude more to make sure there are
+ # enough left even after we filter them by whether they are
+ # visible in the history. This isn't foolproof, as all backfill
+ # points within our limit could still be filtered out, but it
+ # seems like a good amount to try with at least.
+ limit=50,
)
]
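(Aside: a standalone sketch of the selection heuristic described in the comment above: keep only candidate points at or below `current_depth`, cap how many we consider, and try the deepest, i.e. most recent, ones first. The data shapes are illustrative and not Synapse's storage API.)

from typing import List, Tuple

def pick_backfill_points(
    candidates: List[Tuple[str, int]],  # (event_id, depth) pairs
    current_depth: int,
    limit: int = 50,
) -> List[Tuple[str, int]]:
    eligible = [point for point in candidates if point[1] <= current_depth]
    # Deepest first, i.e. closest to our current position in the DAG.
    eligible.sort(key=lambda point: point[1], reverse=True)
    return eligible[:limit]

# With current_depth=10, the point at depth 12 is dropped and the rest come
# back ordered by depth: 9, 7, 4.
assert pick_backfill_points([("$a", 4), ("$b", 9), ("$c", 12), ("$d", 7)], 10) == [
    ("$b", 9),
    ("$d", 7),
    ("$a", 4),
]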
@@ -213,7 +248,12 @@ class FederationHandler:
insertion_events_to_be_backfilled = [
_BackfillPoint(event_id, depth, _BackfillPointType.INSERTION_PONT)
for event_id, depth in await self.store.get_insertion_event_backward_extremities_in_room(
- room_id
+ room_id=room_id,
+ current_depth=current_depth,
+ # We only need to end up with 5 extremities combined with
+ # the backfill points to make the `/backfill` request ...
+ # (see the other comment above for more context).
+ limit=50,
)
]
logger.debug(
@@ -222,10 +262,6 @@ class FederationHandler:
insertion_events_to_be_backfilled,
)
- if not backwards_extremities and not insertion_events_to_be_backfilled:
- logger.debug("Not backfilling as no extremeties found.")
- return False
-
# we now have a list of potential places to backpaginate from. We prefer to
# start with the most recent (ie, max depth), so let's sort the list.
sorted_backfill_points: List[_BackfillPoint] = sorted(
@@ -246,6 +282,33 @@ class FederationHandler:
sorted_backfill_points,
)
+ # If we have no backfill points lower than the `current_depth`, we can
+ # either a) bail or b) still attempt to backfill. We opt to try
+ # backfilling anyway just in case we do get relevant events.
+ if not sorted_backfill_points and current_depth != MAX_DEPTH:
+ logger.debug(
+ "_maybe_backfill_inner: all backfill points are *after* current depth. Trying again with later backfill points."
+ )
+ return await self._maybe_backfill_inner(
+ room_id=room_id,
+ # We use `MAX_DEPTH` so that we find all backfill points next
+ # time (all events are below the `MAX_DEPTH`)
+ current_depth=MAX_DEPTH,
+ limit=limit,
+ # We don't want to start another timing observation from this
+ # nested recursive call; the top-most call records the overall
+ # time, otherwise the smaller nested measurement would throw off
+ # the results.
+ processing_start_time=None,
+ )
+
+ # Even after recursing with `MAX_DEPTH`, we didn't find any
+ # backward extremities to backfill from.
+ if not sorted_backfill_points:
+ logger.debug(
+ "_maybe_backfill_inner: Not backfilling as no backward extremities found."
+ )
+ return False
+
# If we're approaching an extremity we trigger a backfill, otherwise we
# no-op.
#
@@ -255,47 +318,16 @@ class FederationHandler:
# chose more than one times the limit in case of failure, but choosing a
# much larger factor will result in triggering a backfill request much
# earlier than necessary.
- #
- # XXX: shouldn't we do this *after* the filter by depth below? Again, we don't
- # care about events that have happened after our current position.
- #
- max_depth = sorted_backfill_points[0].depth
- if current_depth - 2 * limit > max_depth:
+ max_depth_of_backfill_points = sorted_backfill_points[0].depth
+ if current_depth - 2 * limit > max_depth_of_backfill_points:
logger.debug(
"Not backfilling as we don't need to. %d < %d - 2 * %d",
- max_depth,
+ max_depth_of_backfill_points,
current_depth,
limit,
)
return False
- # We ignore extremities that have a greater depth than our current depth
- # as:
- # 1. we don't really care about getting events that have happened
- # after our current position; and
- # 2. we have likely previously tried and failed to backfill from that
- # extremity, so to avoid getting "stuck" requesting the same
- # backfill repeatedly we drop those extremities.
- #
- # However, we need to check that the filtered extremities are non-empty.
- # If they are empty then either we can a) bail or b) still attempt to
- # backfill. We opt to try backfilling anyway just in case we do get
- # relevant events.
- #
- filtered_sorted_backfill_points = [
- t for t in sorted_backfill_points if t.depth <= current_depth
- ]
- if filtered_sorted_backfill_points:
- logger.debug(
- "_maybe_backfill_inner: backfill points before current depth: %s",
- filtered_sorted_backfill_points,
- )
- sorted_backfill_points = filtered_sorted_backfill_points
- else:
- logger.debug(
- "_maybe_backfill_inner: all backfill points are *after* current depth. Backfilling anyway."
- )
-
# For performance's sake, we only want to paginate from a particular extremity
# if we can actually see the events we'll get. Otherwise, we'd just spend a lot
# of resources to get redacted events. We check each extremity in turn and
@@ -347,6 +379,7 @@ class FederationHandler:
filtered_extremities = await filter_events_for_server(
self._storage_controllers,
self.server_name,
+ self.server_name,
events_to_check,
redact=False,
check_history_visibility_only=True,
@@ -368,23 +401,40 @@ class FederationHandler:
logger.debug(
"_maybe_backfill_inner: extremities_to_request %s", extremities_to_request
)
+ set_tag(
+ SynapseTags.RESULT_PREFIX + "extremities_to_request",
+ str(extremities_to_request),
+ )
+ set_tag(
+ SynapseTags.RESULT_PREFIX + "extremities_to_request.length",
+ str(len(extremities_to_request)),
+ )
# Now we need to decide which hosts to hit first.
-
- # First we try hosts that are already in the room
+ # First we try hosts that are already in the room.
# TODO: HEURISTIC ALERT.
+ likely_domains = (
+ await self._storage_controllers.state.get_current_hosts_in_room_ordered(
+ room_id
+ )
+ )
- curr_state = await self._storage_controllers.state.get_current_state(room_id)
+ async def try_backfill(domains: Collection[str]) -> bool:
+ # TODO: Should we try multiple of these at a time?
- curr_domains = get_domains_from_state(curr_state)
+ # Number of contacted remote homeservers that have denied our backfill
+ # request with a 4xx code.
+ denied_count = 0
- likely_domains = [
- domain for domain, depth in curr_domains if domain != self.server_name
- ]
+ # Maximum number of contacted remote homeservers that can deny our
+ # backfill request with 4xx codes before we give up.
+ max_denied_count = 5
- async def try_backfill(domains: List[str]) -> bool:
- # TODO: Should we try multiple of these at a time?
for dom in domains:
+ # We don't want to ask our own server for information we don't have
+ if dom == self.server_name:
+ continue
+
try:
await self._federation_event_handler.backfill(
dom, room_id, limit=100, extremities=extremities_to_request
@@ -393,36 +443,69 @@ class FederationHandler:
# appropriate stuff.
# TODO: We can probably do something more intelligent here.
return True
+ except NotRetryingDestination as e:
+ logger.info("_maybe_backfill_inner: %s", e)
+ continue
+ except FederationDeniedError:
+ logger.info(
+ "_maybe_backfill_inner: Not attempting to backfill from %s because the homeserver is not on our federation whitelist",
+ dom,
+ )
+ continue
except (SynapseError, InvalidResponseError) as e:
logger.info("Failed to backfill from %s because %s", dom, e)
continue
except HttpResponseException as e:
if 400 <= e.code < 500:
- raise e.to_synapse_error()
+ logger.warning(
+ "Backfill denied from %s because %s [%d/%d]",
+ dom,
+ e,
+ denied_count,
+ max_denied_count,
+ )
+ denied_count += 1
+ if denied_count >= max_denied_count:
+ return False
+ continue
logger.info("Failed to backfill from %s because %s", dom, e)
continue
except CodeMessageException as e:
if 400 <= e.code < 500:
- raise
+ logger.warning(
+ "Backfill denied from %s because %s [%d/%d]",
+ dom,
+ e,
+ denied_count,
+ max_denied_count,
+ )
+ denied_count += 1
+ if denied_count >= max_denied_count:
+ return False
+ continue
logger.info("Failed to backfill from %s because %s", dom, e)
continue
- except NotRetryingDestination as e:
- logger.info(str(e))
- continue
except RequestSendFailed as e:
logger.info("Failed to get backfill from %s because %s", dom, e)
continue
- except FederationDeniedError as e:
- logger.info(e)
- continue
except Exception as e:
logger.exception("Failed to backfill from %s because %s", dom, e)
continue
return False
+ # If we have the `processing_start_time`, then we can make an
+ # observation. We wouldn't have the `processing_start_time` in the case
+ # where `_maybe_backfill_inner` is recursively called to find any
+ # backfill points regardless of `current_depth`.
+ if processing_start_time is not None:
+ processing_end_time = self.clock.time_msec()
+ backfill_processing_before_timer.observe(
+ (processing_end_time - processing_start_time) / 1000
+ )
+
success = await try_backfill(likely_domains)
if success:
return True
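(Aside: a standalone sketch of the denial-counting loop in `try_backfill` above: keep moving on to the next server after failures, but give up once enough servers have denied the request with a 4xx response. `RequestDenied` and `request` are placeholders, not Synapse APIs.)

from typing import Awaitable, Callable, Collection

class RequestDenied(Exception):
    def __init__(self, code: int):
        super().__init__(f"denied with HTTP {code}")
        self.code = code

async def try_each_server(
    servers: Collection[str],
    request: Callable[[str], Awaitable[None]],
    max_denied_count: int = 5,
) -> bool:
    denied_count = 0
    for server in servers:
        try:
            await request(server)
            return True  # the first success is enough
        except RequestDenied as e:
            if 400 <= e.code < 500:
                denied_count += 1
                if denied_count >= max_denied_count:
                    # Enough servers have refused; stop asking the rest.
                    return False
            continue
        except Exception:
            # Any other failure: just try the next server.
            continue
    return False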
@@ -549,7 +632,12 @@ class FederationHandler:
# Mark the room as having partial state.
# The background process is responsible for unmarking this flag,
# even if the join fails.
- await self.store.store_partial_state_room(room_id, ret.servers_in_room)
+ await self.store.store_partial_state_room(
+ room_id=room_id,
+ servers=ret.servers_in_room,
+ device_lists_stream_id=self.store.get_device_stream_token(),
+ joined_via=origin,
+ )
try:
max_stream_id = (
@@ -574,6 +662,14 @@ class FederationHandler:
room_id,
)
raise LimitExceededError(msg=e.msg, errcode=e.errcode, retry_after_ms=0)
+ else:
+ # Record the join event id for future use (when we finish the full
+ # join). We have to do this after persisting the event to keep foreign
+ # key constraints intact.
+ if ret.partial_state:
+ await self.store.write_partial_state_rooms_join_event_id(
+ room_id, event.event_id
+ )
finally:
# Always kick off the background process that asynchronously fetches
# state for the room.
@@ -691,15 +787,27 @@ class FederationHandler:
# Send the signed event back to the room, and potentially receive some
# further information about the room in the form of partial state events
- stripped_room_state = await self.federation_client.send_knock(
- target_hosts, event
- )
+ knock_response = await self.federation_client.send_knock(target_hosts, event)
# Store any stripped room state events in the "unsigned" key of the event.
# This is a bit of a hack and is cribbing off of invites. Basically we
# store the room state here and retrieve it again when this event appears
# in the invitee's sync stream. It is stripped out for all other local users.
- event.unsigned["knock_room_state"] = stripped_room_state["knock_state_events"]
+ stripped_room_state = (
+ knock_response.get("knock_room_state")
+ # Since v1.37, Synapse incorrectly used "knock_state_events" for this field.
+ # Thus, we also check for 'knock_state_events' to support old instances.
+ # See https://github.com/matrix-org/synapse/issues/14088.
+ or knock_response.get("knock_state_events")
+ )
+
+ if stripped_room_state is None:
+ raise KeyError(
+ "Missing 'knock_room_state' (or legacy 'knock_state_events') field in "
+ "send_knock response"
+ )
+
+ event.unsigned["knock_room_state"] = stripped_room_state
context = EventContext.for_outlier(self._storage_controllers)
stream_id = await self._federation_event_handler.persist_events_and_notify(
@@ -752,8 +860,25 @@ class FederationHandler:
# (and return a 404 otherwise)
room_version = await self.store.get_room_version(room_id)
+ if await self.store.is_partial_state_room(room_id):
+ # If our server is still only partially joined, we can't give a complete
+ # response to /make_join, so return a 404 as we would if we weren't in the
+ # room at all.
+ # The main reason we can't respond properly is that we need to know about
+ # the auth events for the join event that we would return.
+ # We also should not bother entertaining the /make_join since we cannot
+ # handle the /send_join.
+ logger.info(
+ "Rejecting /make_join to %s because it's a partial state room", room_id
+ )
+ raise SynapseError(
+ 404,
+ "Unable to handle /make_join right now; this server is not fully joined.",
+ errcode=Codes.NOT_FOUND,
+ )
+
# now check that we are *still* in the room
- is_in_room = await self._event_auth_handler.check_host_in_room(
+ is_in_room = await self._event_auth_handler.is_host_in_room(
room_id, self.server_name
)
if not is_in_room:
@@ -821,7 +946,7 @@ class FederationHandler:
# The remote hasn't signed it yet, obviously. We'll do the full checks
# when we get the event back in `on_send_join_request`
- await self._event_auth_handler.check_auth_rules_from_context(event, context)
+ await self._event_auth_handler.check_auth_rules_from_context(event)
return event
async def on_invite_request(
@@ -895,9 +1020,17 @@ class FederationHandler:
)
context = EventContext.for_outlier(self._storage_controllers)
- await self._federation_event_handler.persist_events_and_notify(
- event.room_id, [(event, context)]
+
+ await self._bulk_push_rule_evaluator.action_for_events_by_user(
+ [(event, context)]
)
+ try:
+ await self._federation_event_handler.persist_events_and_notify(
+ event.room_id, [(event, context)]
+ )
+ except Exception:
+ await self.store.remove_push_actions_from_staging(event.event_id)
+ raise
return event
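(Aside: the change above stages push actions before persisting the invite event and removes them again if persistence fails. A minimal sketch of that "stage, persist, roll back on failure" shape, with hypothetical stand-ins for the evaluator, persister, and store.)

async def persist_with_staged_push_actions(evaluator, persister, store, event, context) -> None:
    # Stage push actions first so they are ready as soon as the event lands.
    await evaluator.action_for_events_by_user([(event, context)])
    try:
        await persister(event.room_id, [(event, context)])
    except Exception:
        # Persisting failed; drop the staged actions so they don't leak.
        await store.remove_push_actions_from_staging(event.event_id)
        raise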
@@ -996,7 +1129,7 @@ class FederationHandler:
try:
# The remote hasn't signed it yet, obviously. We'll do the full checks
# when we get the event back in `on_send_leave_request`
- await self._event_auth_handler.check_auth_rules_from_context(event, context)
+ await self._event_auth_handler.check_auth_rules_from_context(event)
except AuthError as e:
logger.warning("Failed to create new leave %r because %s", event, e)
raise e
@@ -1055,13 +1188,15 @@ class FederationHandler:
try:
# The remote hasn't signed it yet, obviously. We'll do the full checks
# when we get the event back in `on_send_knock_request`
- await self._event_auth_handler.check_auth_rules_from_context(event, context)
+ await self._event_auth_handler.check_auth_rules_from_context(event)
except AuthError as e:
logger.warning("Failed to create new knock %r because %s", event, e)
raise e
return event
+ @trace
+ @tag_args
async def get_state_ids_for_pdu(self, room_id: str, event_id: str) -> List[str]:
"""Returns the state at the event. i.e. not including said event."""
event = await self.store.get_event(event_id, check_room_id=room_id)
@@ -1097,9 +1232,9 @@ class FederationHandler:
async def on_backfill_request(
self, origin: str, room_id: str, pdu_list: List[str], limit: int
) -> List[EventBase]:
- in_room = await self._event_auth_handler.check_host_in_room(room_id, origin)
- if not in_room:
- raise AuthError(403, "Host not in room.")
+ # We allow partially joined rooms since in this case we are filtering out
+ # non-local events in `filter_events_for_server`.
+ await self._event_auth_handler.assert_host_in_room(room_id, origin, True)
# Synapse asks for 100 events per backfill request. Do not allow more.
limit = min(limit, 100)
@@ -1120,7 +1255,7 @@ class FederationHandler:
)
events = await filter_events_for_server(
- self._storage_controllers, origin, events
+ self._storage_controllers, origin, self.server_name, events
)
return events
@@ -1145,21 +1280,17 @@ class FederationHandler:
event_id, allow_none=True, allow_rejected=True
)
- if event:
- in_room = await self._event_auth_handler.check_host_in_room(
- event.room_id, origin
- )
- if not in_room:
- raise AuthError(403, "Host not in room.")
-
- events = await filter_events_for_server(
- self._storage_controllers, origin, [event]
- )
- event = events[0]
- return event
- else:
+ if not event:
return None
+ await self._event_auth_handler.assert_host_in_room(event.room_id, origin)
+
+ events = await filter_events_for_server(
+ self._storage_controllers, origin, self.server_name, [event]
+ )
+ event = events[0]
+ return event
+
async def on_get_missing_events(
self,
origin: str,
@@ -1168,9 +1299,9 @@ class FederationHandler:
latest_events: List[str],
limit: int,
) -> List[EventBase]:
- in_room = await self._event_auth_handler.check_host_in_room(room_id, origin)
- if not in_room:
- raise AuthError(403, "Host not in room.")
+ # We allow partially joined rooms since in this case we are filtering out
+ # non-local events in `filter_events_for_server`.
+ await self._event_auth_handler.assert_host_in_room(room_id, origin, True)
# Only allow up to 20 events to be retrieved per request.
limit = min(limit, 20)
@@ -1183,7 +1314,7 @@ class FederationHandler:
)
missing_events = await filter_events_for_server(
- self._storage_controllers, origin, missing_events
+ self._storage_controllers, origin, self.server_name, missing_events
)
return missing_events
@@ -1204,7 +1335,7 @@ class FederationHandler:
"state_key": target_user_id,
}
- if await self._event_auth_handler.check_host_in_room(room_id, self.hs.hostname):
+ if await self._event_auth_handler.is_host_in_room(room_id, self.hs.hostname):
room_version_obj = await self.store.get_room_version(room_id)
builder = self.event_builder_factory.for_room_version(
room_version_obj, event_dict
@@ -1227,9 +1358,7 @@ class FederationHandler:
try:
validate_event_for_room_version(event)
- await self._event_auth_handler.check_auth_rules_from_context(
- event, context
- )
+ await self._event_auth_handler.check_auth_rules_from_context(event)
except AuthError as e:
logger.warning("Denying new third party invite %r because %s", event, e)
raise e
@@ -1279,7 +1408,7 @@ class FederationHandler:
try:
validate_event_for_room_version(event)
- await self._event_auth_handler.check_auth_rules_from_context(event, context)
+ await self._event_auth_handler.check_auth_rules_from_context(event)
except AuthError as e:
logger.warning("Denying third party invite %r because %s", event, e)
raise e
@@ -1472,8 +1601,8 @@ class FederationHandler:
Fetch the complexity of a remote room over federation.
Args:
- remote_room_hosts (list[str]): The remote servers to ask.
- room_id (str): The room ID to ask about.
+ remote_room_hosts: The remote servers to ask.
+ room_id: The room ID to ask about.
Returns:
Dict contains the complexity
@@ -1495,13 +1624,13 @@ class FederationHandler:
"""Resumes resyncing of all partial-state rooms after a restart."""
assert not self.config.worker.worker_app
- partial_state_rooms = await self.store.get_partial_state_rooms_and_servers()
- for room_id, servers_in_room in partial_state_rooms.items():
+ partial_state_rooms = await self.store.get_partial_state_room_resync_info()
+ for room_id, resync_info in partial_state_rooms.items():
run_as_background_process(
desc="sync_partial_state_room",
func=self._sync_partial_state_room,
- initial_destination=None,
- other_destinations=servers_in_room,
+ initial_destination=resync_info.joined_via,
+ other_destinations=resync_info.servers_in_room,
room_id=room_id,
)
@@ -1530,28 +1659,12 @@ class FederationHandler:
# really leave, that might mean we have difficulty getting the room state over
# federation.
# https://github.com/matrix-org/synapse/issues/12802
- #
- # TODO(faster_joins): we need some way of prioritising which homeservers in
- # `other_destinations` to try first, otherwise we'll spend ages trying dead
- # homeservers for large rooms.
- # https://github.com/matrix-org/synapse/issues/12999
-
- if initial_destination is None and len(other_destinations) == 0:
- raise ValueError(
- f"Cannot resync state of {room_id}: no destinations provided"
- )
# Make an infinite iterator of destinations to try. Once we find a working
# destination, we'll stick with it until it flakes.
- destinations: Collection[str]
- if initial_destination is not None:
- # Move `initial_destination` to the front of the list.
- destinations = list(other_destinations)
- if initial_destination in destinations:
- destinations.remove(initial_destination)
- destinations = [initial_destination] + destinations
- else:
- destinations = other_destinations
+ destinations = _prioritise_destinations_for_partial_state_resync(
+ initial_destination, other_destinations, room_id
+ )
destination_iter = itertools.cycle(destinations)
# `destination` is the current remote homeserver we're pulling from.
@@ -1569,6 +1682,9 @@ class FederationHandler:
# https://github.com/matrix-org/synapse/issues/12994
await self.state_handler.update_current_state(room_id)
+ logger.info("Handling any pending device list updates")
+ await self._device_handler.handle_room_un_partial_stated(room_id)
+
logger.info("Clearing partial-state flag for %s", room_id)
success = await self.store.clear_partial_state_room(room_id)
if success:
@@ -1598,7 +1714,22 @@ class FederationHandler:
destination, event
)
break
+ except FederationPullAttemptBackoffError as exc:
+ # Log a warning about why we failed to process the event (the error message
+ # for `FederationPullAttemptBackoffError` is pretty good)
+ logger.warning("_sync_partial_state_room: %s", exc)
+ # We do not record a failed pull attempt when we back off fetching a missing
+ # `prev_event` because not being able to fetch the `prev_events` just means
+ # we won't be able to de-outlier the pulled event. But we can still use an
+ # `outlier` in the state/auth chain for another event. So we shouldn't stop
+ # a downstream event from trying to pull it.
+ #
+ # This avoids a cascade of backoffs across all the events in the DAG
+ # downstream of a single backed-off event upstream.
except FederationError as e:
+ # TODO: We should `record_event_failed_pull_attempt` here,
+ # see https://github.com/matrix-org/synapse/issues/13700
+
if attempt == len(destinations) - 1:
# We have tried every remote server for this event. Give up.
# TODO(faster_joins) giving up isn't the right thing to do
@@ -1631,3 +1762,29 @@ class FederationHandler:
room_id,
destination,
)
+
+
+def _prioritise_destinations_for_partial_state_resync(
+ initial_destination: Optional[str],
+ other_destinations: Collection[str],
+ room_id: str,
+) -> Collection[str]:
+ """Work out the order in which we should ask servers to resync events.
+
+ If an `initial_destination` is given, it takes top priority. Otherwise
+ all servers are treated equally.
+
+ :raises ValueError: if no destination is provided at all.
+ """
+ if initial_destination is None and len(other_destinations) == 0:
+ raise ValueError(f"Cannot resync state of {room_id}: no destinations provided")
+
+ if initial_destination is None:
+ return other_destinations
+
+ # Move `initial_destination` to the front of the list.
+ destinations = list(other_destinations)
+ if initial_destination in destinations:
+ destinations.remove(initial_destination)
+ destinations = [initial_destination] + destinations
+ return destinations
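(Aside: a usage sketch for the helper above, with illustrative server and room names.)

# With an initial destination, it is moved to the front and deduplicated:
assert _prioritise_destinations_for_partial_state_resync(
    "joined-via.example.org",
    ["other.example.org", "joined-via.example.org"],
    "!room:example.org",
) == ["joined-via.example.org", "other.example.org"]

# Without one, the other destinations are returned as given:
assert _prioritise_destinations_for_partial_state_resync(
    None, ["other.example.org"], "!room:example.org"
) == ["other.example.org"]

# With no destinations at all, the helper raises ValueError.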