diff --git a/changelog.d/6016.misc b/changelog.d/6016.misc
new file mode 100644
index 0000000000..91cf164714
--- /dev/null
+++ b/changelog.d/6016.misc
@@ -0,0 +1 @@
+Add a 'failure_ts' column to the 'destinations' database table.
diff --git a/synapse/storage/schema/delta/56/destinations_failure_ts.sql b/synapse/storage/schema/delta/56/destinations_failure_ts.sql
new file mode 100644
index 0000000000..f00889290b
--- /dev/null
+++ b/synapse/storage/schema/delta/56/destinations_failure_ts.sql
@@ -0,0 +1,25 @@
+/* Copyright 2019 The Matrix.org Foundation C.I.C.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Record the timestamp when a given server started failing
+ */
+ALTER TABLE destinations ADD failure_ts BIGINT;
+
+/* as a rough approximation, we assume that the server started failing at
+ * retry_interval before the last retry
+ */
+UPDATE destinations SET failure_ts = retry_last_ts - retry_interval
+ WHERE retry_last_ts > 0;
diff --git a/synapse/storage/transactions.py b/synapse/storage/transactions.py
index d81ace0ece..289c117396 100644
--- a/synapse/storage/transactions.py
+++ b/synapse/storage/transactions.py
@@ -165,7 +165,7 @@ class TransactionStore(SQLBaseStore):
txn,
table="destinations",
keyvalues={"destination": destination},
- retcols=("destination", "retry_last_ts", "retry_interval"),
+ retcols=("destination", "failure_ts", "retry_last_ts", "retry_interval"),
allow_none=True,
)
@@ -174,12 +174,15 @@ class TransactionStore(SQLBaseStore):
else:
return None
- def set_destination_retry_timings(self, destination, retry_last_ts, retry_interval):
+ def set_destination_retry_timings(
+ self, destination, failure_ts, retry_last_ts, retry_interval
+ ):
"""Sets the current retry timings for a given destination.
Both timings should be zero if retrying is no longer occuring.
Args:
destination (str)
+ failure_ts (int|None) - when the server started failing (ms since epoch)
retry_last_ts (int) - time of last retry attempt in unix epoch ms
retry_interval (int) - how long until next retry in ms
"""
@@ -189,12 +192,13 @@ class TransactionStore(SQLBaseStore):
"set_destination_retry_timings",
self._set_destination_retry_timings,
destination,
+ failure_ts,
retry_last_ts,
retry_interval,
)
def _set_destination_retry_timings(
- self, txn, destination, retry_last_ts, retry_interval
+ self, txn, destination, failure_ts, retry_last_ts, retry_interval
):
if self.database_engine.can_native_upsert:
@@ -202,9 +206,12 @@ class TransactionStore(SQLBaseStore):
# resetting it) or greater than the existing retry interval.
sql = """
- INSERT INTO destinations (destination, retry_last_ts, retry_interval)
- VALUES (?, ?, ?)
+ INSERT INTO destinations (
+ destination, failure_ts, retry_last_ts, retry_interval
+ )
+ VALUES (?, ?, ?, ?)
ON CONFLICT (destination) DO UPDATE SET
+ failure_ts = EXCLUDED.failure_ts,
retry_last_ts = EXCLUDED.retry_last_ts,
retry_interval = EXCLUDED.retry_interval
WHERE
@@ -212,7 +219,7 @@ class TransactionStore(SQLBaseStore):
OR destinations.retry_interval < EXCLUDED.retry_interval
"""
- txn.execute(sql, (destination, retry_last_ts, retry_interval))
+ txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))
return
@@ -225,7 +232,7 @@ class TransactionStore(SQLBaseStore):
txn,
table="destinations",
keyvalues={"destination": destination},
- retcols=("retry_last_ts", "retry_interval"),
+ retcols=("failure_ts", "retry_last_ts", "retry_interval"),
allow_none=True,
)
@@ -235,6 +242,7 @@ class TransactionStore(SQLBaseStore):
table="destinations",
values={
"destination": destination,
+ "failure_ts": failure_ts,
"retry_last_ts": retry_last_ts,
"retry_interval": retry_interval,
},
@@ -245,6 +253,7 @@ class TransactionStore(SQLBaseStore):
"destinations",
keyvalues={"destination": destination},
updatevalues={
+ "failure_ts": failure_ts,
"retry_last_ts": retry_last_ts,
"retry_interval": retry_interval,
},
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index b740913b58..a5f2fbef5c 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -80,11 +80,13 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs)
# We aren't ready to retry that destination.
raise
"""
+ failure_ts = None
retry_last_ts, retry_interval = (0, 0)
retry_timings = yield store.get_destination_retry_timings(destination)
if retry_timings:
+ failure_ts = retry_timings["failure_ts"]
retry_last_ts, retry_interval = (
retry_timings["retry_last_ts"],
retry_timings["retry_interval"],
@@ -108,6 +110,7 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs)
destination,
clock,
store,
+ failure_ts,
retry_interval,
backoff_on_failure=backoff_on_failure,
**kwargs
@@ -120,6 +123,7 @@ class RetryDestinationLimiter(object):
destination,
clock,
store,
+ failure_ts,
retry_interval,
backoff_on_404=False,
backoff_on_failure=True,
@@ -133,6 +137,8 @@ class RetryDestinationLimiter(object):
destination (str)
clock (Clock)
store (DataStore)
+ failure_ts (int|None): when this destination started failing (in ms since
+ the epoch), or None if the last request was successful
retry_interval (int): The next retry interval taken from the
database in milliseconds, or zero if the last request was
successful.
@@ -145,6 +151,7 @@ class RetryDestinationLimiter(object):
self.store = store
self.destination = destination
+ self.failure_ts = failure_ts
self.retry_interval = retry_interval
self.backoff_on_404 = backoff_on_404
self.backoff_on_failure = backoff_on_failure
@@ -186,6 +193,7 @@ class RetryDestinationLimiter(object):
logger.debug(
"Connection to %s was successful; clearing backoff", self.destination
)
+ self.failure_ts = None
retry_last_ts = 0
self.retry_interval = 0
elif not self.backoff_on_failure:
@@ -211,11 +219,17 @@ class RetryDestinationLimiter(object):
)
retry_last_ts = int(self.clock.time_msec())
+ if self.failure_ts is None:
+ self.failure_ts = retry_last_ts
+
@defer.inlineCallbacks
def store_retry_timings():
try:
yield self.store.set_destination_retry_timings(
- self.destination, retry_last_ts, self.retry_interval
+ self.destination,
+ self.failure_ts,
+ retry_last_ts,
+ self.retry_interval,
)
except Exception:
logger.exception("Failed to store destination_retry_timings")
diff --git a/tests/handlers/test_typing.py b/tests/handlers/test_typing.py
index 5d5e324df2..1f2ef5d01f 100644
--- a/tests/handlers/test_typing.py
+++ b/tests/handlers/test_typing.py
@@ -99,7 +99,12 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
self.event_source = hs.get_event_sources().sources["typing"]
self.datastore = hs.get_datastore()
- retry_timings_res = {"destination": "", "retry_last_ts": 0, "retry_interval": 0}
+ retry_timings_res = {
+ "destination": "",
+ "retry_last_ts": 0,
+ "retry_interval": 0,
+ "failure_ts": None,
+ }
self.datastore.get_destination_retry_timings.return_value = defer.succeed(
retry_timings_res
)
diff --git a/tests/storage/test_transactions.py b/tests/storage/test_transactions.py
index 14169afa96..a771d5af29 100644
--- a/tests/storage/test_transactions.py
+++ b/tests/storage/test_transactions.py
@@ -29,17 +29,19 @@ class TransactionStoreTestCase(HomeserverTestCase):
r = self.get_success(d)
self.assertIsNone(r)
- d = self.store.set_destination_retry_timings("example.com", 50, 100)
+ d = self.store.set_destination_retry_timings("example.com", 1000, 50, 100)
self.get_success(d)
d = self.store.get_destination_retry_timings("example.com")
r = self.get_success(d)
- self.assert_dict({"retry_last_ts": 50, "retry_interval": 100}, r)
+ self.assert_dict(
+ {"retry_last_ts": 50, "retry_interval": 100, "failure_ts": 1000}, r
+ )
def test_initial_set_transactions(self):
"""Tests that we can successfully set the destination retries (there
was a bug around invalidating the cache that broke this)
"""
- d = self.store.set_destination_retry_timings("example.com", 50, 100)
+ d = self.store.set_destination_retry_timings("example.com", 1000, 50, 100)
self.get_success(d)
diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py
new file mode 100644
index 0000000000..9e348694ad
--- /dev/null
+++ b/tests/util/test_retryutils.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+# Copyright 2019 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from synapse.util.retryutils import (
+ MIN_RETRY_INTERVAL,
+ RETRY_MULTIPLIER,
+ NotRetryingDestination,
+ get_retry_limiter,
+)
+
+from tests.unittest import HomeserverTestCase
+
+
+class RetryLimiterTestCase(HomeserverTestCase):
+ def test_new_destination(self):
+ """A happy-path case with a new destination and a successful operation"""
+ store = self.hs.get_datastore()
+ d = get_retry_limiter("test_dest", self.clock, store)
+ self.pump()
+ limiter = self.successResultOf(d)
+
+ # advance the clock a bit before making the request
+ self.pump(1)
+
+ with limiter:
+ pass
+
+ d = store.get_destination_retry_timings("test_dest")
+ self.pump()
+ new_timings = self.successResultOf(d)
+ self.assertIsNone(new_timings)
+
+ def test_limiter(self):
+ """General test case which walks through the process of a failing request"""
+ store = self.hs.get_datastore()
+
+ d = get_retry_limiter("test_dest", self.clock, store)
+ self.pump()
+ limiter = self.successResultOf(d)
+
+ self.pump(1)
+ try:
+ with limiter:
+ self.pump(1)
+ failure_ts = self.clock.time_msec()
+ raise AssertionError("argh")
+ except AssertionError:
+ pass
+
+ # wait for the update to land
+ self.pump()
+
+ d = store.get_destination_retry_timings("test_dest")
+ self.pump()
+ new_timings = self.successResultOf(d)
+ self.assertEqual(new_timings["failure_ts"], failure_ts)
+ self.assertEqual(new_timings["retry_last_ts"], failure_ts)
+ self.assertEqual(new_timings["retry_interval"], MIN_RETRY_INTERVAL)
+
+ # now if we try again we should get a failure
+ d = get_retry_limiter("test_dest", self.clock, store)
+ self.pump()
+ self.failureResultOf(d, NotRetryingDestination)
+
+ #
+ # advance the clock and try again
+ #
+
+ self.pump(MIN_RETRY_INTERVAL)
+ d = get_retry_limiter("test_dest", self.clock, store)
+ self.pump()
+ limiter = self.successResultOf(d)
+
+ self.pump(1)
+ try:
+ with limiter:
+ self.pump(1)
+ retry_ts = self.clock.time_msec()
+ raise AssertionError("argh")
+ except AssertionError:
+ pass
+
+ # wait for the update to land
+ self.pump()
+
+ d = store.get_destination_retry_timings("test_dest")
+ self.pump()
+ new_timings = self.successResultOf(d)
+ self.assertEqual(new_timings["failure_ts"], failure_ts)
+ self.assertEqual(new_timings["retry_last_ts"], retry_ts)
+ self.assertGreaterEqual(
+ new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
+ )
+ self.assertLessEqual(
+ new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
+ )
+
+ #
+ # one more go, with success
+ #
+ self.pump(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
+ d = get_retry_limiter("test_dest", self.clock, store)
+ self.pump()
+ limiter = self.successResultOf(d)
+
+ self.pump(1)
+ with limiter:
+ self.pump(1)
+
+ # wait for the update to land
+ self.pump()
+
+ d = store.get_destination_retry_timings("test_dest")
+ self.pump()
+ new_timings = self.successResultOf(d)
+ self.assertIsNone(new_timings)
|