diff --git a/changelog.d/14096.misc b/changelog.d/14096.misc
new file mode 100644
index 0000000000..2c07dc673b
--- /dev/null
+++ b/changelog.d/14096.misc
@@ -0,0 +1 @@
+Refactor [MSC3030](https://github.com/matrix-org/matrix-spec-proposals/pull/3030) `/timestamp_to_event` endpoint to loop over federation destinations with standard pattern and error handling.
diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py
index b220ab43fc..fa225182be 100644
--- a/synapse/federation/federation_client.py
+++ b/synapse/federation/federation_client.py
@@ -80,6 +80,18 @@ PDU_RETRY_TIME_MS = 1 * 60 * 1000
T = TypeVar("T")
+@attr.s(frozen=True, slots=True, auto_attribs=True)
+class PulledPduInfo:
+ """
+ A result object that stores the PDU and info about it like which homeserver we
+ pulled it from (`pull_origin`)
+ """
+
+ pdu: EventBase
+ # Which homeserver we pulled the PDU from
+ pull_origin: str
+
+
class InvalidResponseError(RuntimeError):
"""Helper for _try_destination_list: indicates that the server returned a response
we couldn't parse
@@ -114,7 +126,9 @@ class FederationClient(FederationBase):
self.hostname = hs.hostname
self.signing_key = hs.signing_key
- self._get_pdu_cache: ExpiringCache[str, EventBase] = ExpiringCache(
+ # Cache mapping `event_id` to a tuple of the event itself and the `pull_origin`
+ # (which server we pulled the event from)
+ self._get_pdu_cache: ExpiringCache[str, Tuple[EventBase, str]] = ExpiringCache(
cache_name="get_pdu_cache",
clock=self._clock,
max_len=1000,
@@ -352,11 +366,11 @@ class FederationClient(FederationBase):
@tag_args
async def get_pdu(
self,
- destinations: Iterable[str],
+ destinations: Collection[str],
event_id: str,
room_version: RoomVersion,
timeout: Optional[int] = None,
- ) -> Optional[EventBase]:
+ ) -> Optional[PulledPduInfo]:
"""Requests the PDU with given origin and ID from the remote home
servers.
@@ -371,11 +385,11 @@ class FederationClient(FederationBase):
moving to the next destination. None indicates no timeout.
Returns:
- The requested PDU, or None if we were unable to find it.
+ The requested PDU wrapped in `PulledPduInfo`, or None if we were unable to find it.
"""
logger.debug(
- "get_pdu: event_id=%s from destinations=%s", event_id, destinations
+ "get_pdu(event_id=%s): from destinations=%s", event_id, destinations
)
# TODO: Rate limit the number of times we try and get the same event.
@@ -384,19 +398,25 @@ class FederationClient(FederationBase):
# it gets persisted to the database), so we cache the results of the lookup.
# Note that this is separate to the regular get_event cache which caches
# events once they have been persisted.
- event = self._get_pdu_cache.get(event_id)
+ get_pdu_cache_entry = self._get_pdu_cache.get(event_id)
+ event = None
+ pull_origin = None
+ if get_pdu_cache_entry:
+ event, pull_origin = get_pdu_cache_entry
# If we don't see the event in the cache, go try to fetch it from the
# provided remote federated destinations
- if not event:
+ else:
pdu_attempts = self.pdu_destination_tried.setdefault(event_id, {})
+ # TODO: We can probably refactor this to use `_try_destination_list`
for destination in destinations:
now = self._clock.time_msec()
last_attempt = pdu_attempts.get(destination, 0)
if last_attempt + PDU_RETRY_TIME_MS > now:
logger.debug(
- "get_pdu: skipping destination=%s because we tried it recently last_attempt=%s and we only check every %s (now=%s)",
+ "get_pdu(event_id=%s): skipping destination=%s because we tried it recently last_attempt=%s and we only check every %s (now=%s)",
+ event_id,
destination,
last_attempt,
PDU_RETRY_TIME_MS,
@@ -411,43 +431,48 @@ class FederationClient(FederationBase):
room_version=room_version,
timeout=timeout,
)
+ pull_origin = destination
pdu_attempts[destination] = now
if event:
# Prime the cache
- self._get_pdu_cache[event.event_id] = event
+ self._get_pdu_cache[event.event_id] = (event, pull_origin)
# Now that we have an event, we can break out of this
# loop and stop asking other destinations.
break
+ except NotRetryingDestination as e:
+ logger.info("get_pdu(event_id=%s): %s", event_id, e)
+ continue
+ except FederationDeniedError:
+ logger.info(
+ "get_pdu(event_id=%s): Not attempting to fetch PDU from %s because the homeserver is not on our federation whitelist",
+ event_id,
+ destination,
+ )
+ continue
except SynapseError as e:
logger.info(
- "Failed to get PDU %s from %s because %s",
+ "get_pdu(event_id=%s): Failed to get PDU from %s because %s",
event_id,
destination,
e,
)
continue
- except NotRetryingDestination as e:
- logger.info(str(e))
- continue
- except FederationDeniedError as e:
- logger.info(str(e))
- continue
except Exception as e:
pdu_attempts[destination] = now
logger.info(
- "Failed to get PDU %s from %s because %s",
+ "get_pdu(event_id=): Failed to get PDU from %s because %s",
event_id,
destination,
e,
)
continue
- if not event:
+ if not event or not pull_origin:
return None
# `event` now refers to an object stored in `get_pdu_cache`. Our
@@ -459,7 +484,7 @@ class FederationClient(FederationBase):
event.room_version,
)
- return event_copy
+ return PulledPduInfo(event_copy, pull_origin)
@trace
@tag_args
@@ -699,12 +724,14 @@ class FederationClient(FederationBase):
pdu_origin = get_domain_from_id(pdu.sender)
if not res and pdu_origin != origin:
try:
- res = await self.get_pdu(
+ pulled_pdu_info = await self.get_pdu(
destinations=[pdu_origin],
event_id=pdu.event_id,
room_version=room_version,
timeout=10000,
)
+ if pulled_pdu_info is not None:
+ res = pulled_pdu_info.pdu
except SynapseError:
pass
@@ -806,6 +833,7 @@ class FederationClient(FederationBase):
)
for destination in destinations:
+ # We don't want to ask our own server for information we don't have
if destination == self.server_name:
continue
@@ -814,9 +842,21 @@ class FederationClient(FederationBase):
except (
RequestSendFailed,
InvalidResponseError,
- NotRetryingDestination,
) as e:
logger.warning("Failed to %s via %s: %s", description, destination, e)
+ # Skip to the next homeserver in the list to try.
+ continue
+ except NotRetryingDestination as e:
+ logger.info("%s: %s", description, e)
+ continue
+ except FederationDeniedError:
+ logger.info(
+ "%s: Not attempting to %s from %s because the homeserver is not on our federation whitelist",
+ description,
+ description,
+ destination,
+ )
+ continue
except UnsupportedRoomVersionError:
raise
except HttpResponseException as e:
@@ -1609,6 +1649,54 @@ class FederationClient(FederationBase):
return result
async def timestamp_to_event(
+ self, *, destinations: List[str], room_id: str, timestamp: int, direction: str
+ ) -> Optional["TimestampToEventResponse"]:
+ """
+ Calls each remote federating server from `destinations` asking for their closest
+ event to the given timestamp in the given direction until we get a response.
+ Also validates the response to always return the expected keys or raises an
+ error.
+
+ Args:
+ destinations: The domains of homeservers to try fetching from
+ room_id: Room to fetch the event from
+ timestamp: The point in time (inclusive) we should navigate from in
+ the given direction to find the closest event.
+ direction: ["f"|"b"] to indicate whether we should navigate forward
+ or backward from the given timestamp to find the closest event.
+
+ Returns:
+ A parsed TimestampToEventResponse including the closest event_id
+ and origin_server_ts or None if no destination has a response.
+ """
+
+ async def _timestamp_to_event_from_destination(
+ destination: str,
+ ) -> TimestampToEventResponse:
+ return await self._timestamp_to_event_from_destination(
+ destination, room_id, timestamp, direction
+ )
+
+ try:
+ # Loop through each homeserver candidate until we get a succesful response
+ timestamp_to_event_response = await self._try_destination_list(
+ "timestamp_to_event",
+ destinations,
+ # TODO: The requested timestamp may lie in a part of the
+ # event graph that the remote server *also* didn't have,
+ # in which case they will have returned another event
+ # which may be nowhere near the requested timestamp. In
+ # the future, we may need to reconcile that gap and ask
+ # other homeservers, and/or extend `/timestamp_to_event`
+ # to return events on *both* sides of the timestamp to
+ # help reconcile the gap faster.
+ _timestamp_to_event_from_destination,
+ )
+ return timestamp_to_event_response
+ except SynapseError:
+ return None
+
+ async def _timestamp_to_event_from_destination(
self, destination: str, room_id: str, timestamp: int, direction: str
) -> "TimestampToEventResponse":
"""
diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py
index 4fbc79a6cb..5fc3b8bc8c 100644
--- a/synapse/handlers/federation.py
+++ b/synapse/handlers/federation.py
@@ -442,6 +442,15 @@ class FederationHandler:
# appropriate stuff.
# TODO: We can probably do something more intelligent here.
return True
+ except NotRetryingDestination as e:
+ logger.info("_maybe_backfill_inner: %s", e)
+ continue
+ except FederationDeniedError:
+ logger.info(
+ "_maybe_backfill_inner: Not attempting to backfill from %s because the homeserver is not on our federation whitelist",
+ dom,
+ )
+ continue
except (SynapseError, InvalidResponseError) as e:
logger.info("Failed to backfill from %s because %s", dom, e)
continue
@@ -477,15 +486,9 @@ class FederationHandler:
logger.info("Failed to backfill from %s because %s", dom, e)
continue
- except NotRetryingDestination as e:
- logger.info(str(e))
- continue
except RequestSendFailed as e:
logger.info("Failed to get backfill from %s because %s", dom, e)
continue
- except FederationDeniedError as e:
- logger.info(e)
- continue
except Exception as e:
logger.exception("Failed to backfill from %s because %s", dom, e)
continue
diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py
index 7da6316a82..9ca5df7c78 100644
--- a/synapse/handlers/federation_event.py
+++ b/synapse/handlers/federation_event.py
@@ -58,7 +58,7 @@ from synapse.event_auth import (
)
from synapse.events import EventBase
from synapse.events.snapshot import EventContext
-from synapse.federation.federation_client import InvalidResponseError
+from synapse.federation.federation_client import InvalidResponseError, PulledPduInfo
from synapse.logging.context import nested_logging_context
from synapse.logging.opentracing import (
SynapseTags,
@@ -1517,8 +1517,8 @@ class FederationEventHandler:
)
async def backfill_event_id(
- self, destination: str, room_id: str, event_id: str
- ) -> EventBase:
+ self, destinations: List[str], room_id: str, event_id: str
+ ) -> PulledPduInfo:
"""Backfill a single event and persist it as a non-outlier which means
we also pull in all of the state and auth events necessary for it.
@@ -1530,24 +1530,21 @@ class FederationEventHandler:
Raises:
FederationError if we are unable to find the event from the destination
"""
- logger.info(
- "backfill_event_id: event_id=%s from destination=%s", event_id, destination
- )
+ logger.info("backfill_event_id: event_id=%s", event_id)
room_version = await self._store.get_room_version(room_id)
- event_from_response = await self._federation_client.get_pdu(
- [destination],
+ pulled_pdu_info = await self._federation_client.get_pdu(
+ destinations,
event_id,
room_version,
)
- if not event_from_response:
+ if not pulled_pdu_info:
raise FederationError(
"ERROR",
404,
- "Unable to find event_id=%s from destination=%s to backfill."
- % (event_id, destination),
+ f"Unable to find event_id={event_id} from remote servers to backfill.",
affected=event_id,
)
@@ -1555,13 +1552,13 @@ class FederationEventHandler:
# and auth events to de-outlier it. This also sets up the necessary
# `state_groups` for the event.
await self._process_pulled_events(
- destination,
- [event_from_response],
+ pulled_pdu_info.pull_origin,
+ [pulled_pdu_info.pdu],
# Prevent notifications going to clients
backfilled=True,
)
- return event_from_response
+ return pulled_pdu_info
@trace
@tag_args
@@ -1584,19 +1581,19 @@ class FederationEventHandler:
async def get_event(event_id: str) -> None:
with nested_logging_context(event_id):
try:
- event = await self._federation_client.get_pdu(
+ pulled_pdu_info = await self._federation_client.get_pdu(
[destination],
event_id,
room_version,
)
- if event is None:
+ if pulled_pdu_info is None:
logger.warning(
"Server %s didn't return event %s",
destination,
event_id,
)
return
- events.append(event)
+ events.append(pulled_pdu_info.pdu)
except Exception as e:
logger.warning(
diff --git a/synapse/handlers/room.py b/synapse/handlers/room.py
index cc1e5c8f97..de97886ea9 100644
--- a/synapse/handlers/room.py
+++ b/synapse/handlers/room.py
@@ -49,7 +49,6 @@ from synapse.api.constants import (
from synapse.api.errors import (
AuthError,
Codes,
- HttpResponseException,
LimitExceededError,
NotFoundError,
StoreError,
@@ -60,7 +59,6 @@ from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, RoomVersion
from synapse.event_auth import validate_event_for_room_version
from synapse.events import EventBase
from synapse.events.utils import copy_and_fixup_power_levels_contents
-from synapse.federation.federation_client import InvalidResponseError
from synapse.handlers.relations import BundledAggregations
from synapse.module_api import NOT_SPAM
from synapse.rest.admin._base import assert_user_is_admin
@@ -1472,7 +1470,12 @@ class TimestampLookupHandler:
Raises:
SynapseError if unable to find any event locally in the given direction
"""
-
+ logger.debug(
+ "get_event_for_timestamp(room_id=%s, timestamp=%s, direction=%s) Finding closest event...",
+ room_id,
+ timestamp,
+ direction,
+ )
local_event_id = await self.store.get_event_id_for_timestamp(
room_id, timestamp, direction
)
@@ -1524,85 +1527,54 @@ class TimestampLookupHandler:
)
)
- # Loop through each homeserver candidate until we get a succesful response
- for domain in likely_domains:
- # We don't want to ask our own server for information we don't have
- if domain == self.server_name:
- continue
+ remote_response = await self.federation_client.timestamp_to_event(
+ destinations=likely_domains,
+ room_id=room_id,
+ timestamp=timestamp,
+ direction=direction,
+ )
+ if remote_response is not None:
+ logger.debug(
+ "get_event_for_timestamp: remote_response=%s",
+ remote_response,
+ )
- try:
- remote_response = await self.federation_client.timestamp_to_event(
- domain, room_id, timestamp, direction
- )
- logger.debug(
- "get_event_for_timestamp: response from domain(%s)=%s",
- domain,
- remote_response,
- )
+ remote_event_id = remote_response.event_id
+ remote_origin_server_ts = remote_response.origin_server_ts
- remote_event_id = remote_response.event_id
- remote_origin_server_ts = remote_response.origin_server_ts
-
- # Backfill this event so we can get a pagination token for
- # it with `/context` and paginate `/messages` from this
- # point.
- #
- # TODO: The requested timestamp may lie in a part of the
- # event graph that the remote server *also* didn't have,
- # in which case they will have returned another event
- # which may be nowhere near the requested timestamp. In
- # the future, we may need to reconcile that gap and ask
- # other homeservers, and/or extend `/timestamp_to_event`
- # to return events on *both* sides of the timestamp to
- # help reconcile the gap faster.
- remote_event = (
- await self.federation_event_handler.backfill_event_id(
- domain, room_id, remote_event_id
- )
- )
+ # Backfill this event so we can get a pagination token for
+ # it with `/context` and paginate `/messages` from this
+ # point.
+ pulled_pdu_info = await self.federation_event_handler.backfill_event_id(
+ likely_domains, room_id, remote_event_id
+ )
+ remote_event = pulled_pdu_info.pdu
- # XXX: When we see that the remote server is not trustworthy,
- # maybe we should not ask them first in the future.
- if remote_origin_server_ts != remote_event.origin_server_ts:
- logger.info(
- "get_event_for_timestamp: Remote server (%s) claimed that remote_event_id=%s occured at remote_origin_server_ts=%s but that isn't true (actually occured at %s). Their claims are dubious and we should consider not trusting them.",
- domain,
- remote_event_id,
- remote_origin_server_ts,
- remote_event.origin_server_ts,
- )
-
- # Only return the remote event if it's closer than the local event
- if not local_event or (
- abs(remote_event.origin_server_ts - timestamp)
- < abs(local_event.origin_server_ts - timestamp)
- ):
- logger.info(
- "get_event_for_timestamp: returning remote_event_id=%s (%s) since it's closer to timestamp=%s than local_event=%s (%s)",
- remote_event_id,
- remote_event.origin_server_ts,
- timestamp,
- local_event.event_id if local_event else None,
- local_event.origin_server_ts if local_event else None,
- )
- return remote_event_id, remote_origin_server_ts
- except (HttpResponseException, InvalidResponseError) as ex:
- # Let's not put a high priority on some other homeserver
- # failing to respond or giving a random response
- logger.debug(
- "get_event_for_timestamp: Failed to fetch /timestamp_to_event from %s because of exception(%s) %s args=%s",
- domain,
- type(ex).__name__,
- ex,
- ex.args,
+ # XXX: When we see that the remote server is not trustworthy,
+ # maybe we should not ask them first in the future.
+ if remote_origin_server_ts != remote_event.origin_server_ts:
+ logger.info(
+ "get_event_for_timestamp: Remote server (%s) claimed that remote_event_id=%s occured at remote_origin_server_ts=%s but that isn't true (actually occured at %s). Their claims are dubious and we should consider not trusting them.",
+ pulled_pdu_info.pull_origin,
+ remote_event_id,
+ remote_origin_server_ts,
+ remote_event.origin_server_ts,
)
- except Exception:
- # But we do want to see some exceptions in our code
- logger.warning(
- "get_event_for_timestamp: Failed to fetch /timestamp_to_event from %s because of exception",
- domain,
- exc_info=True,
+
+ # Only return the remote event if it's closer than the local event
+ if not local_event or (
+ abs(remote_event.origin_server_ts - timestamp)
+ < abs(local_event.origin_server_ts - timestamp)
+ ):
+ logger.info(
+ "get_event_for_timestamp: returning remote_event_id=%s (%s) since it's closer to timestamp=%s than local_event=%s (%s)",
+ remote_event_id,
+ remote_event.origin_server_ts,
+ timestamp,
+ local_event.event_id if local_event else None,
+ local_event.origin_server_ts if local_event else None,
)
+ return remote_event_id, remote_origin_server_ts
# To appease mypy, we have to add both of these conditions to check for
# `None`. We only expect `local_event` to be `None` when
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index d0a69ff843..dcc037b982 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -51,7 +51,7 @@ class NotRetryingDestination(Exception):
destination: the domain in question
"""
- msg = "Not retrying server %s." % (destination,)
+ msg = f"Not retrying server {destination} because we tried it recently retry_last_ts={retry_last_ts} and we won't check for another retry_interval={retry_interval}ms."
super().__init__(msg)
self.retry_last_ts = retry_last_ts
diff --git a/tests/federation/test_federation_client.py b/tests/federation/test_federation_client.py
index 51d3bb8fff..e67f405826 100644
--- a/tests/federation/test_federation_client.py
+++ b/tests/federation/test_federation_client.py
@@ -142,14 +142,14 @@ class FederationClientTest(FederatingHomeserverTestCase):
def test_get_pdu_returns_nothing_when_event_does_not_exist(self):
"""No event should be returned when the event does not exist"""
- remote_pdu = self.get_success(
+ pulled_pdu_info = self.get_success(
self.hs.get_federation_client().get_pdu(
["yet.another.server"],
"event_should_not_exist",
RoomVersions.V9,
)
)
- self.assertEqual(remote_pdu, None)
+ self.assertEqual(pulled_pdu_info, None)
def test_get_pdu(self):
"""Test to make sure an event is returned by `get_pdu()`"""
@@ -169,13 +169,15 @@ class FederationClientTest(FederatingHomeserverTestCase):
remote_pdu.internal_metadata.outlier = True
# Get the event again. This time it should read it from cache.
- remote_pdu2 = self.get_success(
+ pulled_pdu_info2 = self.get_success(
self.hs.get_federation_client().get_pdu(
["yet.another.server"],
remote_pdu.event_id,
RoomVersions.V9,
)
)
+ self.assertIsNotNone(pulled_pdu_info2)
+ remote_pdu2 = pulled_pdu_info2.pdu
# Sanity check that we are working against the same event
self.assertEqual(remote_pdu.event_id, remote_pdu2.event_id)
@@ -215,13 +217,15 @@ class FederationClientTest(FederatingHomeserverTestCase):
)
)
- remote_pdu = self.get_success(
+ pulled_pdu_info = self.get_success(
self.hs.get_federation_client().get_pdu(
["yet.another.server"],
"event_id",
RoomVersions.V9,
)
)
+ self.assertIsNotNone(pulled_pdu_info)
+ remote_pdu = pulled_pdu_info.pdu
# check the right call got made to the agent
self._mock_agent.request.assert_called_once_with(
|