-rw-r--r--  .circleci/config.yml | 54
-rw-r--r--  .gitignore | 1
-rw-r--r--  .travis.yml | 8
-rw-r--r--  CHANGES.md | 74
-rw-r--r--  CONTRIBUTING.rst | 38
-rw-r--r--  MANIFEST.in | 1
-rw-r--r--  changelog.d/3576.feature | 1
-rw-r--r--  changelog.d/3578.bugfix | 1
-rw-r--r--  changelog.d/3699.misc | 2
-rw-r--r--  changelog.d/3704.misc | 1
-rw-r--r--  changelog.d/3771.misc | 1
-rw-r--r--  changelog.d/3788.bugfix | 1
-rw-r--r--  changelog.d/3789.misc | 1
-rw-r--r--  changelog.d/3790.feature | 1
-rw-r--r--  changelog.d/3795.misc | 1
-rw-r--r--  changelog.d/3800.bugfix | 1
-rw-r--r--  changelog.d/3803.misc | 1
-rw-r--r--  changelog.d/3804.bugfix | 1
-rw-r--r--  changelog.d/3805.misc | 1
-rw-r--r--  changelog.d/3806.misc | 1
-rw-r--r--  changelog.d/3808.misc | 1
-rw-r--r--  changelog.d/3810.bugfix | 1
-rw-r--r--  changelog.d/3822.misc | 1
-rw-r--r--  changelog.d/3823.misc | 1
-rw-r--r--  changelog.d/3824.bugfix | 1
-rw-r--r--  changelog.d/3826.misc | 1
-rw-r--r--  changelog.d/3827.misc | 1
-rw-r--r--  changelog.d/3834.misc | 1
-rw-r--r--  changelog.d/3835.bugfix | 1
-rw-r--r--  changelog.d/3840.misc | 1
-rw-r--r--  changelog.d/3841.bugfix | 1
-rw-r--r--  changelog.d/3845.bugfix | 1
-rw-r--r--  changelog.d/3846.feature | 1
-rw-r--r--  changelog.d/3847.misc | 1
-rw-r--r--  changelog.d/3851.bugfix | 1
-rw-r--r--  changelog.d/3853.misc | 1
-rw-r--r--  changelog.d/3855.misc | 1
-rw-r--r--  changelog.d/3856.misc | 1
-rw-r--r--  changelog.d/3857.misc | 1
-rw-r--r--  changelog.d/3858.misc | 1
-rw-r--r--  changelog.d/3859.bugfix | 1
-rw-r--r--  changelog.d/3860.misc | 1
-rw-r--r--  changelog.d/3868.bugfix | 1
-rw-r--r--  changelog.d/3871.misc | 1
-rw-r--r--  changelog.d/3872.misc | 1
-rw-r--r--  changelog.d/3874.bugfix | 0
-rw-r--r--  changelog.d/3875.bugfix | 1
-rw-r--r--  changelog.d/3877.misc | 1
-rw-r--r--  changelog.d/3888.misc | 1
-rw-r--r--  changelog.d/3904.misc | 1
-rw-r--r--  changelog.d/3908.bugfix | 1
-rw-r--r--  changelog.d/3911.misc | 1
-rw-r--r--  changelog.d/3912.misc | 1
-rw-r--r--  changelog.d/3914.bugfix | 1
-rw-r--r--  changelog.d/3916.feature | 1
-rw-r--r--  changelog.d/3924.misc | 1
-rw-r--r--  changelog.d/3925.misc | 1
-rw-r--r--  changelog.d/3927.misc | 1
-rw-r--r--  changelog.d/3932.bugfix | 1
-rw-r--r--  changelog.d/3936.bugfix | 1
-rw-r--r--  changelog.d/3946.misc | 1
-rw-r--r--  changelog.d/3947.misc | 1
-rw-r--r--  changelog.d/3948.misc | 1
-rw-r--r--  changelog.d/3952.misc | 1
-rw-r--r--  changelog.d/3956.bugfix | 1
-rw-r--r--  changelog.d/3957.misc | 1
-rw-r--r--  changelog.d/3958.misc | 1
-rw-r--r--  changelog.d/3959.feature | 1
-rw-r--r--  changelog.d/3961.bugfix | 1
-rw-r--r--  changelog.d/3963.misc | 1
-rw-r--r--  changelog.d/3965.misc | 1
-rw-r--r--  changelog.d/3966.misc | 1
-rw-r--r--  changelog.d/3967.misc | 1
-rw-r--r--  changelog.d/3970.bugfix | 1
-rw-r--r--  contrib/grafana/synapse.json | 592
-rw-r--r--  docker/Dockerfile | 3
-rw-r--r--  docker/Dockerfile-pgtests | 12
-rwxr-xr-x  docker/run_pg_tests.sh | 20
-rwxr-xr-x  docker/start.py | 3
-rwxr-xr-x  scripts-dev/dump_macaroon.py | 2
-rwxr-xr-x  scripts-dev/next_github_number.sh | 9
-rw-r--r--  synapse/__init__.py | 2
-rw-r--r--  synapse/api/filtering.py | 2
-rw-r--r--  synapse/app/__init__.py | 2
-rw-r--r--  synapse/app/appservice.py | 3
-rw-r--r--  synapse/app/client_reader.py | 3
-rw-r--r--  synapse/app/event_creator.py | 3
-rw-r--r--  synapse/app/federation_reader.py | 3
-rw-r--r--  synapse/app/federation_sender.py | 3
-rw-r--r--  synapse/app/frontend_proxy.py | 3
-rwxr-xr-x  synapse/app/homeserver.py | 5
-rw-r--r--  synapse/app/media_repository.py | 3
-rw-r--r--  synapse/app/pusher.py | 3
-rw-r--r--  synapse/app/synchrotron.py | 3
-rwxr-xr-x  synapse/app/synctl.py | 284
-rw-r--r--  synapse/app/user_dir.py | 3
-rw-r--r--  synapse/config/__main__.py | 2
-rw-r--r--  synapse/federation/federation_client.py | 38
-rw-r--r--  synapse/federation/federation_server.py | 34
-rw-r--r--  synapse/handlers/e2e_keys.py | 2
-rw-r--r--  synapse/handlers/federation.py | 286
-rw-r--r--  synapse/handlers/profile.py | 2
-rw-r--r--  synapse/handlers/sync.py | 9
-rw-r--r--  synapse/http/server.py | 49
-rw-r--r--  synapse/http/site.py | 6
-rw-r--r--  synapse/metrics/background_process_metrics.py | 6
-rw-r--r--  synapse/notifier.py | 8
-rw-r--r--  synapse/push/mailer.py | 2
-rw-r--r--  synapse/python_dependencies.py | 31
-rw-r--r--  synapse/rest/media/v1/download_resource.py | 1
-rw-r--r--  synapse/rest/media/v1/preview_url_resource.py | 1
-rw-r--r--  synapse/state/__init__.py | 9
-rw-r--r--  synapse/state/v1.py | 14
-rw-r--r--  synapse/storage/client_ips.py | 34
-rw-r--r--  synapse/storage/monthly_active_users.py | 5
-rw-r--r--  synapse/storage/state.py | 30
-rw-r--r--  synapse/storage/transactions.py | 8
-rw-r--r--  synapse/util/caches/__init__.py | 27
-rw-r--r--  synapse/util/caches/expiringcache.py | 18
-rw-r--r--  synapse/util/logcontext.py | 41
-rw-r--r--  synapse/util/retryutils.py | 2
-rwxr-xr-x [l---------]  synctl | 295
-rwxr-xr-x  test_postgresql.sh | 12
-rw-r--r--  tests/http/test_fedclient.py | 4
-rw-r--r--  tests/server.py | 8
-rw-r--r--  tests/storage/test_client_ips.py | 202
-rw-r--r--  tests/storage/test_state.py | 39
-rw-r--r--  tests/test_federation.py | 28
-rw-r--r--  tests/test_server.py | 74
-rw-r--r--  tests/unittest.py | 12
-rw-r--r--  tests/util/test_expiring_cache.py | 1
-rw-r--r--  tests/util/test_logcontext.py | 5
-rw-r--r--  tests/utils.py | 27
-rw-r--r--  tox.ini | 32
134 files changed, 1770 insertions(+), 844 deletions(-)
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 605430fb3f..3cb14793fc 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,5 +1,21 @@
 version: 2
 jobs:
+  dockerhubuploadrelease:
+    machine: true
+    steps:
+      - checkout
+      - run: docker build -f docker/Dockerfile -t matrixdotorg/synapse:$CIRCLE_TAG .
+      - run: docker login --username $DOCKER_HUB_USERNAME --password $DOCKER_HUB_PASSWORD
+      - run: docker push matrixdotorg/synapse:$CIRCLE_TAG
+  dockerhubuploadlatest:
+    machine: true
+    steps:
+      - checkout
+      - run: docker build -f docker/Dockerfile -t matrixdotorg/synapse:$CIRCLE_SHA1 .
+      - run: docker login --username $DOCKER_HUB_USERNAME --password $DOCKER_HUB_PASSWORD
+      - run: docker tag matrixdotorg/synapse:$CIRCLE_SHA1 matrixdotorg/synapse:latest
+      - run: docker push matrixdotorg/synapse:$CIRCLE_SHA1
+      - run: docker push matrixdotorg/synapse:latest
   sytestpy2:
     machine: true
     steps:
@@ -99,23 +115,45 @@ workflows:
   version: 2
   build:
     jobs:
-      - sytestpy2
-      - sytestpy2postgres
-      - sytestpy3
-      - sytestpy3postgres
+      - sytestpy2:
+          filters:
+            branches:
+              only: /develop|master|release-.*/
+      - sytestpy2postgres:
+          filters:
+            branches:
+              only: /develop|master|release-.*/
+      - sytestpy3:
+          filters:
+            branches:
+              only: /develop|master|release-.*/
+      - sytestpy3postgres:
+          filters:
+            branches:
+              only: /develop|master|release-.*/
       - sytestpy2merged:
           filters:
             branches:
-              ignore: /develop|master/
+              ignore: /develop|master|release-.*/
       - sytestpy2postgresmerged:
           filters:
             branches:
-              ignore: /develop|master/
+              ignore: /develop|master|release-.*/
       - sytestpy3merged:
           filters:
             branches:
-              ignore: /develop|master/
+              ignore: /develop|master|release-.*/
       - sytestpy3postgresmerged:
           filters:
             branches:
-              ignore: /develop|master/
+              ignore: /develop|master|release-.*/
+      - dockerhubuploadrelease:
+          filters:
+            tags:
+              only: /^v[0-9].[0-9]+.[0-9]+(.[0-9]+)?/
+            branches:
+              ignore: /.*/
+      - dockerhubuploadlatest:
+          filters:
+            branches:
+              only: master
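The two jobs added above automate Docker Hub publishing: dockerhubuploadrelease fires only on version tags matching the tag filter and pushes an image tagged with $CIRCLE_TAG, while dockerhubuploadlatest runs on every commit to master and additionally retags that build as latest. As a rough sketch, the release job amounts to the following shell steps (illustrative only; in CI the tag and the $DOCKER_HUB_USERNAME/$DOCKER_HUB_PASSWORD credentials are supplied by CircleCI):

    # build the image from the in-tree Dockerfile, tagged with the release tag
    docker build -f docker/Dockerfile -t matrixdotorg/synapse:$CIRCLE_TAG .
    # authenticate to Docker Hub and publish the tagged image
    docker login --username "$DOCKER_HUB_USERNAME" --password "$DOCKER_HUB_PASSWORD"
    docker push matrixdotorg/synapse:$CIRCLE_TAG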
diff --git a/.gitignore b/.gitignore
index a725725235..3b2252ad8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.pyc
 .*.swp
 *~
+*.lock
 
 .DS_Store
 _trial_temp/
diff --git a/.travis.yml b/.travis.yml
index b3ee66da8f..2077f6af72 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,6 +21,9 @@ matrix:
     env: TOX_ENV=py27
 
   - python: 2.7
+    env: TOX_ENV=py27-old
+
+  - python: 2.7
     env: TOX_ENV=py27-postgres TRIAL_FLAGS="-j 4"
     services:
       - postgresql
@@ -32,6 +35,11 @@ matrix:
     env: TOX_ENV=py36
 
   - python: 3.6
+    env: TOX_ENV=py36-postgres TRIAL_FLAGS="-j 4"
+    services:
+      - postgresql
+
+  - python: 3.6
     env: TOX_ENV=check_isort
 
   - python: 3.6
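The two Travis entries added above correspond to tox environments that can also be run locally: py27-old exercises the oldest supported versions of the dependencies (see the #3952 changelog entry below), and py36-postgres runs the suite against PostgreSQL on Python 3.6. A minimal local sketch (assumes tox is installed and, for the postgres environment, a local PostgreSQL that the test user can create databases on):

    # run against the oldest supported dependency versions
    tox -e py27-old
    # run the PostgreSQL-backed suite on Python 3.6, with trial parallelised
    TRIAL_FLAGS="-j 4" tox -e py36-postgres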
diff --git a/CHANGES.md b/CHANGES.md
index ee864c3c63..45d3cdb131 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,77 @@
+Synapse 0.33.5.1 (2018-09-25)
+=============================
+
+Internal Changes
+----------------
+
+- Fix incompatibility with an older Twisted version in tests. Thanks
+  @OlegGirko! ([\#3940](https://github.com/matrix-org/synapse/issues/3940))
+
+
+Synapse 0.33.5 (2018-09-24)
+===========================
+
+No significant changes.
+
+
+Synapse 0.33.5rc1 (2018-09-17)
+==============================
+
+Features
+--------
+
+- Python 3.5 and 3.6 support is now in beta. ([\#3576](https://github.com/matrix-org/synapse/issues/3576))
+- Implement `event_format` filter param in `/sync` ([\#3790](https://github.com/matrix-org/synapse/issues/3790))
+- Add synapse_admin_mau:registered_reserved_users metric to expose the number of real reserved users ([\#3846](https://github.com/matrix-org/synapse/issues/3846))
+
+
+Bugfixes
+--------
+
+- Remove connection ID for replication prometheus metrics, as it creates a large number of new series. ([\#3788](https://github.com/matrix-org/synapse/issues/3788))
+- Guest users should not be part of the MAU total ([\#3800](https://github.com/matrix-org/synapse/issues/3800))
+- Bump dependency on pyopenssl 16.x, to avoid incompatibility with recent Twisted. ([\#3804](https://github.com/matrix-org/synapse/issues/3804))
+- Fix existing room tags not coming down sync when joining a room ([\#3810](https://github.com/matrix-org/synapse/issues/3810))
+- Fix JWT import check ([\#3824](https://github.com/matrix-org/synapse/issues/3824))
+- Fix VoIP crashes under Python 3 (#3821) ([\#3835](https://github.com/matrix-org/synapse/issues/3835))
+- Fix manhole so that it works with latest openssh clients ([\#3841](https://github.com/matrix-org/synapse/issues/3841))
+- Fix outbound requests occasionally wedging, which can result in federation breaking between servers. ([\#3845](https://github.com/matrix-org/synapse/issues/3845))
+- Show heroes if room name/canonical alias has been deleted ([\#3851](https://github.com/matrix-org/synapse/issues/3851))
+- Fix handling of redacted events from federation ([\#3859](https://github.com/matrix-org/synapse/issues/3859))
+-  ([\#3874](https://github.com/matrix-org/synapse/issues/3874))
+- Mitigate outbound federation randomly becoming wedged ([\#3875](https://github.com/matrix-org/synapse/issues/3875))
+
+
+Internal Changes
+----------------
+
+- CircleCI tests now run on the potential merge of a PR. ([\#3704](https://github.com/matrix-org/synapse/issues/3704))
+- http/ is now ported to Python 3. ([\#3771](https://github.com/matrix-org/synapse/issues/3771))
+- Improve human-readable error messages for threepid registration/account update ([\#3789](https://github.com/matrix-org/synapse/issues/3789))
+- Make /sync slightly faster by avoiding needless copies ([\#3795](https://github.com/matrix-org/synapse/issues/3795))
+- handlers/ is now ported to Python 3. ([\#3803](https://github.com/matrix-org/synapse/issues/3803))
+- Limit the number of PDUs/EDUs per federation transaction ([\#3805](https://github.com/matrix-org/synapse/issues/3805))
+- Only start postgres instance for postgres tests on Travis CI ([\#3806](https://github.com/matrix-org/synapse/issues/3806))
+- tests/ is now ported to Python 3. ([\#3808](https://github.com/matrix-org/synapse/issues/3808))
+- crypto/ is now ported to Python 3. ([\#3822](https://github.com/matrix-org/synapse/issues/3822))
+- rest/ is now ported to Python 3. ([\#3823](https://github.com/matrix-org/synapse/issues/3823))
+- Add some logging for the keyring queue ([\#3826](https://github.com/matrix-org/synapse/issues/3826))
+- Speed up lazy loading by 2-3x ([\#3827](https://github.com/matrix-org/synapse/issues/3827))
+- Improved Dockerfile to remove build requirements after building, reducing the image size. ([\#3834](https://github.com/matrix-org/synapse/issues/3834))
+- Disable lazy loading for incremental syncs for now ([\#3840](https://github.com/matrix-org/synapse/issues/3840))
+- federation/ is now ported to Python 3. ([\#3847](https://github.com/matrix-org/synapse/issues/3847))
+- Log when we retry outbound requests ([\#3853](https://github.com/matrix-org/synapse/issues/3853))
+- Removed some excess logging messages. ([\#3855](https://github.com/matrix-org/synapse/issues/3855))
+- Speed up purge history for rooms that have been previously purged ([\#3856](https://github.com/matrix-org/synapse/issues/3856))
+- Refactor some HTTP timeout code. ([\#3857](https://github.com/matrix-org/synapse/issues/3857))
+- Fix running merged builds on CircleCI ([\#3858](https://github.com/matrix-org/synapse/issues/3858))
+- Fix typo in replication stream exception. ([\#3860](https://github.com/matrix-org/synapse/issues/3860))
+- Add in flight real time metrics for Measure blocks ([\#3871](https://github.com/matrix-org/synapse/issues/3871))
+- Disable buffering and automatic retrying in treq requests to prevent timeouts. ([\#3872](https://github.com/matrix-org/synapse/issues/3872))
+- Mention jemalloc in the README ([\#3877](https://github.com/matrix-org/synapse/issues/3877))
+- Remove unmaintained "nuke-room-from-db.sh" script ([\#3888](https://github.com/matrix-org/synapse/issues/3888))
+
+
 Synapse 0.33.4 (2018-09-07)
 ===========================
 
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index f9de78a460..6ef7d48dc7 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -30,12 +30,28 @@ use github's pull request workflow to review the contribution, and either ask
 you to make any refinements needed or merge it and make them ourselves. The
 changes will then land on master when we next do a release.
 
-We use `Jenkins <http://matrix.org/jenkins>`_ and
-`Travis <https://travis-ci.org/matrix-org/synapse>`_ for continuous
-integration. All pull requests to synapse get automatically tested by Travis;
-the Jenkins builds require an adminstrator to start them. If your change
-breaks the build, this will be shown in github, so please keep an eye on the
-pull request for feedback.
+We use `CircleCI <https://circleci.com/gh/matrix-org>`_ and `Travis CI 
+<https://travis-ci.org/matrix-org/synapse>`_ for continuous integration. All
+pull requests to synapse get automatically tested by Travis and CircleCI.
+If your change breaks the build, this will be shown in GitHub, so please
+keep an eye on the pull request for feedback.
+
+To run unit tests in a local development environment, you can use:
+
+- ``tox -e py27`` (requires tox to be installed by ``pip install tox``) for
+  SQLite-backed Synapse on Python 2.7.
+- ``tox -e py35`` for SQLite-backed Synapse on Python 3.5.
+- ``tox -e py36`` for SQLite-backed Synapse on Python 3.6.
+- ``tox -e py27-postgres`` for PostgreSQL-backed Synapse on Python 2.7
+  (requires a running local PostgreSQL with access to create databases).
+- ``./test_postgresql.sh`` for PostgreSQL-backed Synapse on Python 2.7
+  (requires Docker). Entirely self-contained, recommended if you don't want to
+  set up PostgreSQL yourself.
+
+Docker images are available for running the integration tests (SyTest) locally,
+see the `documentation in the SyTest repo
+<https://github.com/matrix-org/sytest/blob/develop/docker/README.md>`_ for more
+information.
 
 Code style
 ~~~~~~~~~~
@@ -77,7 +93,8 @@ AUTHORS.rst file for the project in question. Please feel free to include a
 change to AUTHORS.rst in your pull request to list yourself and a short
 description of the area(s) you've worked on. Also, we sometimes have swag to
 give away to contributors - if you feel that Matrix-branded apparel is missing
-from your life, please mail us your shipping address to matrix at matrix.org and we'll try to fix it :)
+from your life, please mail us your shipping address to matrix at matrix.org and
+we'll try to fix it :)
 
 Sign off
 ~~~~~~~~
@@ -144,4 +161,9 @@ flag to ``git commit``, which uses the name and email set in your
 Conclusion
 ~~~~~~~~~~
 
-That's it!  Matrix is a very open and collaborative project as you might expect given our obsession with open communication.  If we're going to successfully matrix together all the fragmented communication technologies out there we are reliant on contributions and collaboration from the community to do so.  So please get involved - and we hope you have as much fun hacking on Matrix as we do!
+That's it!  Matrix is a very open and collaborative project as you might expect
+given our obsession with open communication.  If we're going to successfully
+matrix together all the fragmented communication technologies out there we are
+reliant on contributions and collaboration from the community to do so.  So
+please get involved - and we hope you have as much fun hacking on Matrix as we
+do!
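Taken together, the new testing instructions above boil down to something like the following local workflow (a sketch; assumes a synapse checkout, and a working Docker daemon for the final step):

    pip install tox            # one-off setup
    tox -e py27                # SQLite-backed Synapse on Python 2.7
    tox -e py36                # SQLite-backed Synapse on Python 3.6
    tox -e py27-postgres       # needs a local PostgreSQL with createdb rights
    ./test_postgresql.sh       # PostgreSQL under Docker, fully self-contained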
diff --git a/MANIFEST.in b/MANIFEST.in
index e0826ba544..47ae5a77b9 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -28,6 +28,7 @@ exclude jenkins*.sh
 exclude jenkins*
 exclude Dockerfile
 exclude .dockerignore
+exclude test_postgresql.sh
 recursive-exclude jenkins *.sh
 
 include pyproject.toml
diff --git a/changelog.d/3576.feature b/changelog.d/3576.feature
deleted file mode 100644
index 02a10e370d..0000000000
--- a/changelog.d/3576.feature
+++ /dev/null
@@ -1 +0,0 @@
-Python 3.5+ is now supported.
diff --git a/changelog.d/3578.bugfix b/changelog.d/3578.bugfix
new file mode 100644
index 0000000000..9c52b6fa7e
--- /dev/null
+++ b/changelog.d/3578.bugfix
@@ -0,0 +1 @@
+Fix problem when playing media from Chrome using a direct URL (thanks @remjey!)
diff --git a/changelog.d/3699.misc b/changelog.d/3699.misc
new file mode 100644
index 0000000000..437efbd98f
--- /dev/null
+++ b/changelog.d/3699.misc
@@ -0,0 +1,2 @@
+Unit tests can now be run under PostgreSQL in Docker using 
+``test_postgresql.sh``.
diff --git a/changelog.d/3704.misc b/changelog.d/3704.misc
deleted file mode 100644
index aaae0fbd63..0000000000
--- a/changelog.d/3704.misc
+++ /dev/null
@@ -1 +0,0 @@
-CircleCI tests now run on the potential merge of a PR.
diff --git a/changelog.d/3771.misc b/changelog.d/3771.misc
deleted file mode 100644
index 47aa34bc04..0000000000
--- a/changelog.d/3771.misc
+++ /dev/null
@@ -1 +0,0 @@
-http/ is now ported to Python 3.
diff --git a/changelog.d/3788.bugfix b/changelog.d/3788.bugfix
deleted file mode 100644
index 72316fb881..0000000000
--- a/changelog.d/3788.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Remove connection ID for replication prometheus metrics, as it creates a large number of new series.
diff --git a/changelog.d/3789.misc b/changelog.d/3789.misc
deleted file mode 100644
index d2d5d91091..0000000000
--- a/changelog.d/3789.misc
+++ /dev/null
@@ -1 +0,0 @@
-Improve human readable error messages for threepid registration/account update
diff --git a/changelog.d/3790.feature b/changelog.d/3790.feature
deleted file mode 100644
index 2c4ac62fb5..0000000000
--- a/changelog.d/3790.feature
+++ /dev/null
@@ -1 +0,0 @@
-Implement `event_format` filter param in `/sync`
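For context, the `event_format` filter param mentioned in the changelog entry above comes from the Matrix client-server filtering API: a client uploads a filter with `"event_format": "federation"` (the default is `"client"`) and passes the returned filter ID to `/sync`. An illustrative shell sketch, where $HS, $USER_ID, $TOKEN and $FILTER_ID are placeholders for a homeserver URL, user ID, access token and the returned filter ID:

    # upload a filter requesting federation-format events
    curl -X POST "$HS/_matrix/client/r0/user/$USER_ID/filter?access_token=$TOKEN" \
         -d '{"event_format": "federation"}'
    # use the returned filter_id on /sync
    curl "$HS/_matrix/client/r0/sync?filter=$FILTER_ID&access_token=$TOKEN"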
diff --git a/changelog.d/3795.misc b/changelog.d/3795.misc
deleted file mode 100644
index 9f64ee5e2b..0000000000
--- a/changelog.d/3795.misc
+++ /dev/null
@@ -1 +0,0 @@
-Make /sync slightly faster by avoiding needless copies
diff --git a/changelog.d/3800.bugfix b/changelog.d/3800.bugfix
deleted file mode 100644
index 6b2e18b4a6..0000000000
--- a/changelog.d/3800.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-guest users should not be part of mau total
diff --git a/changelog.d/3803.misc b/changelog.d/3803.misc
deleted file mode 100644
index 2b60653c29..0000000000
--- a/changelog.d/3803.misc
+++ /dev/null
@@ -1 +0,0 @@
-handlers/ is now ported to Python 3.
diff --git a/changelog.d/3804.bugfix b/changelog.d/3804.bugfix
deleted file mode 100644
index a0cef20e3f..0000000000
--- a/changelog.d/3804.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Bump dependency on pyopenssl 16.x, to avoid incompatibility with recent Twisted.
diff --git a/changelog.d/3805.misc b/changelog.d/3805.misc
deleted file mode 100644
index 257feeb071..0000000000
--- a/changelog.d/3805.misc
+++ /dev/null
@@ -1 +0,0 @@
-Limit the number of PDUs/EDUs per federation transaction
diff --git a/changelog.d/3806.misc b/changelog.d/3806.misc
deleted file mode 100644
index 3c722eef2d..0000000000
--- a/changelog.d/3806.misc
+++ /dev/null
@@ -1 +0,0 @@
-Only start postgres instance for postgres tests on Travis CI
diff --git a/changelog.d/3808.misc b/changelog.d/3808.misc
deleted file mode 100644
index e5e1cd9e0e..0000000000
--- a/changelog.d/3808.misc
+++ /dev/null
@@ -1 +0,0 @@
-tests/ is now ported to Python 3.
diff --git a/changelog.d/3810.bugfix b/changelog.d/3810.bugfix
deleted file mode 100644
index 2b938a81ae..0000000000
--- a/changelog.d/3810.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Fix existing room tags not coming down sync when joining a room
diff --git a/changelog.d/3822.misc b/changelog.d/3822.misc
deleted file mode 100644
index 5250f31896..0000000000
--- a/changelog.d/3822.misc
+++ /dev/null
@@ -1 +0,0 @@
-crypto/ is now ported to Python 3.
diff --git a/changelog.d/3823.misc b/changelog.d/3823.misc
deleted file mode 100644
index 0da491ddaa..0000000000
--- a/changelog.d/3823.misc
+++ /dev/null
@@ -1 +0,0 @@
-rest/ is now ported to Python 3.
diff --git a/changelog.d/3824.bugfix b/changelog.d/3824.bugfix
deleted file mode 100644
index 99f199dcc6..0000000000
--- a/changelog.d/3824.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Fix jwt import check
\ No newline at end of file
diff --git a/changelog.d/3826.misc b/changelog.d/3826.misc
deleted file mode 100644
index a4d9a012f9..0000000000
--- a/changelog.d/3826.misc
+++ /dev/null
@@ -1 +0,0 @@
-add some logging for the keyring queue
diff --git a/changelog.d/3827.misc b/changelog.d/3827.misc
deleted file mode 100644
index bc294706cf..0000000000
--- a/changelog.d/3827.misc
+++ /dev/null
@@ -1 +0,0 @@
-speed up lazy loading by 2-3x
diff --git a/changelog.d/3834.misc b/changelog.d/3834.misc
deleted file mode 100644
index 8902f8fba7..0000000000
--- a/changelog.d/3834.misc
+++ /dev/null
@@ -1 +0,0 @@
-Improved Dockerfile to remove build requirements after building reducing the image size.
diff --git a/changelog.d/3835.bugfix b/changelog.d/3835.bugfix
deleted file mode 100644
index 00dbcbc8dc..0000000000
--- a/changelog.d/3835.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-fix VOIP crashes under Python 3 (#3821)
diff --git a/changelog.d/3840.misc b/changelog.d/3840.misc
deleted file mode 100644
index b9585ae9be..0000000000
--- a/changelog.d/3840.misc
+++ /dev/null
@@ -1 +0,0 @@
-Disable lazy loading for incremental syncs for now
diff --git a/changelog.d/3841.bugfix b/changelog.d/3841.bugfix
deleted file mode 100644
index 2a48a7dd66..0000000000
--- a/changelog.d/3841.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Fix manhole so that it works with latest openssh clients
diff --git a/changelog.d/3845.bugfix b/changelog.d/3845.bugfix
deleted file mode 100644
index 5b7e8f1934..0000000000
--- a/changelog.d/3845.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Fix outbound requests occasionally wedging, which can result in federation breaking between servers.
diff --git a/changelog.d/3846.feature b/changelog.d/3846.feature
deleted file mode 100644
index 453c11d3f8..0000000000
--- a/changelog.d/3846.feature
+++ /dev/null
@@ -1 +0,0 @@
-Add synapse_admin_mau:registered_reserved_users metric to expose number of real reaserved users 
diff --git a/changelog.d/3847.misc b/changelog.d/3847.misc
deleted file mode 100644
index bf8b5afea4..0000000000
--- a/changelog.d/3847.misc
+++ /dev/null
@@ -1 +0,0 @@
-federation/ is now ported to Python 3.
\ No newline at end of file
diff --git a/changelog.d/3851.bugfix b/changelog.d/3851.bugfix
deleted file mode 100644
index b53a9efe7b..0000000000
--- a/changelog.d/3851.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Show heroes if room name/canonical alias has been deleted
diff --git a/changelog.d/3853.misc b/changelog.d/3853.misc
deleted file mode 100644
index db45d4983d..0000000000
--- a/changelog.d/3853.misc
+++ /dev/null
@@ -1 +0,0 @@
-Log when we retry outbound requests
diff --git a/changelog.d/3855.misc b/changelog.d/3855.misc
deleted file mode 100644
index a25bb020ba..0000000000
--- a/changelog.d/3855.misc
+++ /dev/null
@@ -1 +0,0 @@
-Removed some excess logging messages.
\ No newline at end of file
diff --git a/changelog.d/3856.misc b/changelog.d/3856.misc
deleted file mode 100644
index 36c311eb3d..0000000000
--- a/changelog.d/3856.misc
+++ /dev/null
@@ -1 +0,0 @@
-Speed up purge history for rooms that have been previously purged
diff --git a/changelog.d/3857.misc b/changelog.d/3857.misc
deleted file mode 100644
index e128d193d9..0000000000
--- a/changelog.d/3857.misc
+++ /dev/null
@@ -1 +0,0 @@
-Refactor some HTTP timeout code.
\ No newline at end of file
diff --git a/changelog.d/3858.misc b/changelog.d/3858.misc
deleted file mode 100644
index 4644db5330..0000000000
--- a/changelog.d/3858.misc
+++ /dev/null
@@ -1 +0,0 @@
-Fix running merged builds on CircleCI
\ No newline at end of file
diff --git a/changelog.d/3859.bugfix b/changelog.d/3859.bugfix
deleted file mode 100644
index ec5b172464..0000000000
--- a/changelog.d/3859.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Fix handling of redacted events from federation
diff --git a/changelog.d/3860.misc b/changelog.d/3860.misc
deleted file mode 100644
index 364239d3e3..0000000000
--- a/changelog.d/3860.misc
+++ /dev/null
@@ -1 +0,0 @@
-Fix typo in replication stream exception.
diff --git a/changelog.d/3868.bugfix b/changelog.d/3868.bugfix
new file mode 100644
index 0000000000..99da8389a5
--- /dev/null
+++ b/changelog.d/3868.bugfix
@@ -0,0 +1 @@
+Fix broken invite email links for self-hosted Riots
diff --git a/changelog.d/3871.misc b/changelog.d/3871.misc
deleted file mode 100644
index dd9510ceb6..0000000000
--- a/changelog.d/3871.misc
+++ /dev/null
@@ -1 +0,0 @@
-Add in flight real time metrics for Measure blocks
diff --git a/changelog.d/3872.misc b/changelog.d/3872.misc
deleted file mode 100644
index b450c506d8..0000000000
--- a/changelog.d/3872.misc
+++ /dev/null
@@ -1 +0,0 @@
-Disable buffering and automatic retrying in treq requests to prevent timeouts.
\ No newline at end of file
diff --git a/changelog.d/3874.bugfix b/changelog.d/3874.bugfix
deleted file mode 100644
index e69de29bb2..0000000000
--- a/changelog.d/3874.bugfix
+++ /dev/null
diff --git a/changelog.d/3875.bugfix b/changelog.d/3875.bugfix
deleted file mode 100644
index 2d2147dd4b..0000000000
--- a/changelog.d/3875.bugfix
+++ /dev/null
@@ -1 +0,0 @@
-Mitigate outbound federation randomly becoming wedged
diff --git a/changelog.d/3877.misc b/changelog.d/3877.misc
deleted file mode 100644
index a80fec4bd8..0000000000
--- a/changelog.d/3877.misc
+++ /dev/null
@@ -1 +0,0 @@
-mention jemalloc in the README
diff --git a/changelog.d/3888.misc b/changelog.d/3888.misc
deleted file mode 100644
index a10ede547e..0000000000
--- a/changelog.d/3888.misc
+++ /dev/null
@@ -1 +0,0 @@
-Remove unmaintained "nuke-room-from-db.sh" script
diff --git a/changelog.d/3904.misc b/changelog.d/3904.misc
new file mode 100644
index 0000000000..1e3c8e1706
--- /dev/null
+++ b/changelog.d/3904.misc
@@ -0,0 +1 @@
+Improve the logging when handling a federation transaction
\ No newline at end of file
diff --git a/changelog.d/3908.bugfix b/changelog.d/3908.bugfix
new file mode 100644
index 0000000000..518aee6c4d
--- /dev/null
+++ b/changelog.d/3908.bugfix
@@ -0,0 +1 @@
+Fix adding client IPs to the database failing on Python 3.
\ No newline at end of file
diff --git a/changelog.d/3911.misc b/changelog.d/3911.misc
new file mode 100644
index 0000000000..e31311d520
--- /dev/null
+++ b/changelog.d/3911.misc
@@ -0,0 +1 @@
+Fix the Docker image build on Python 3
diff --git a/changelog.d/3912.misc b/changelog.d/3912.misc
new file mode 100644
index 0000000000..87d73697ea
--- /dev/null
+++ b/changelog.d/3912.misc
@@ -0,0 +1 @@
+Add a regression test for logging failed HTTP requests on Python 3.
\ No newline at end of file
diff --git a/changelog.d/3914.bugfix b/changelog.d/3914.bugfix
new file mode 100644
index 0000000000..27e6bad590
--- /dev/null
+++ b/changelog.d/3914.bugfix
@@ -0,0 +1 @@
+Fix bug where outbound federation would stop talking to some servers when using workers
diff --git a/changelog.d/3916.feature b/changelog.d/3916.feature
new file mode 100644
index 0000000000..13282d992b
--- /dev/null
+++ b/changelog.d/3916.feature
@@ -0,0 +1 @@
+Always LL ourselves if we're in a room
diff --git a/changelog.d/3924.misc b/changelog.d/3924.misc
new file mode 100644
index 0000000000..8d010e0bd9
--- /dev/null
+++ b/changelog.d/3924.misc
@@ -0,0 +1 @@
+Comments and interface cleanup for on_receive_pdu
\ No newline at end of file
diff --git a/changelog.d/3925.misc b/changelog.d/3925.misc
new file mode 100644
index 0000000000..3e41f78ff5
--- /dev/null
+++ b/changelog.d/3925.misc
@@ -0,0 +1 @@
+Fix spurious exceptions when the remote HTTP client closes the connection
diff --git a/changelog.d/3927.misc b/changelog.d/3927.misc
new file mode 100644
index 0000000000..4bd8e25f67
--- /dev/null
+++ b/changelog.d/3927.misc
@@ -0,0 +1 @@
+Log exceptions thrown by background tasks
diff --git a/changelog.d/3932.bugfix b/changelog.d/3932.bugfix
new file mode 100644
index 0000000000..7578414ede
--- /dev/null
+++ b/changelog.d/3932.bugfix
@@ -0,0 +1 @@
+Fix some instances of ExpiringCache not expiring cache items
diff --git a/changelog.d/3936.bugfix b/changelog.d/3936.bugfix
new file mode 100644
index 0000000000..49b02b9e27
--- /dev/null
+++ b/changelog.d/3936.bugfix
@@ -0,0 +1 @@
+Fix out-of-bounds error when LLing yourself
diff --git a/changelog.d/3946.misc b/changelog.d/3946.misc
new file mode 100644
index 0000000000..803857a297
--- /dev/null
+++ b/changelog.d/3946.misc
@@ -0,0 +1 @@
+Automate pushes to Docker Hub
diff --git a/changelog.d/3947.misc b/changelog.d/3947.misc
new file mode 100644
index 0000000000..5a9a22bed9
--- /dev/null
+++ b/changelog.d/3947.misc
@@ -0,0 +1 @@
+Require attrs 16.0.0 or later
diff --git a/changelog.d/3948.misc b/changelog.d/3948.misc
new file mode 100644
index 0000000000..a93edbd1c3
--- /dev/null
+++ b/changelog.d/3948.misc
@@ -0,0 +1 @@
+Fix incompatibility with Python 3 on Alpine
\ No newline at end of file
diff --git a/changelog.d/3952.misc b/changelog.d/3952.misc
new file mode 100644
index 0000000000..015e4a43e6
--- /dev/null
+++ b/changelog.d/3952.misc
@@ -0,0 +1 @@
+Run the test suite on the oldest supported versions of our dependencies in CI.
\ No newline at end of file
diff --git a/changelog.d/3956.bugfix b/changelog.d/3956.bugfix
new file mode 100644
index 0000000000..b0828c9fc6
--- /dev/null
+++ b/changelog.d/3956.bugfix
@@ -0,0 +1 @@
+Fix exceptions from metrics handler
\ No newline at end of file
diff --git a/changelog.d/3957.misc b/changelog.d/3957.misc
new file mode 100644
index 0000000000..69d647f119
--- /dev/null
+++ b/changelog.d/3957.misc
@@ -0,0 +1 @@
+CircleCI now only runs merged jobs on PRs, and commit jobs on develop, master, and release branches.
diff --git a/changelog.d/3958.misc b/changelog.d/3958.misc
new file mode 100644
index 0000000000..5931d06dcf
--- /dev/null
+++ b/changelog.d/3958.misc
@@ -0,0 +1 @@
+Fix docstrings and add tests for state store methods
diff --git a/changelog.d/3959.feature b/changelog.d/3959.feature
new file mode 100644
index 0000000000..b3a4f37a8d
--- /dev/null
+++ b/changelog.d/3959.feature
@@ -0,0 +1 @@
+Include eventid in log lines when processing incoming federation transactions
\ No newline at end of file
diff --git a/changelog.d/3961.bugfix b/changelog.d/3961.bugfix
new file mode 100644
index 0000000000..e46b5834aa
--- /dev/null
+++ b/changelog.d/3961.bugfix
@@ -0,0 +1 @@
+Fix errors due to concurrent monthly_active_user upserts 
diff --git a/changelog.d/3963.misc b/changelog.d/3963.misc
new file mode 100644
index 0000000000..f1e0eaf18e
--- /dev/null
+++ b/changelog.d/3963.misc
@@ -0,0 +1 @@
+Fix docstring for FederationClient.get_state_for_room
diff --git a/changelog.d/3965.misc b/changelog.d/3965.misc
new file mode 100644
index 0000000000..e7e4a9c5a8
--- /dev/null
+++ b/changelog.d/3965.misc
@@ -0,0 +1 @@
+Run notify_app_services as a background process
diff --git a/changelog.d/3966.misc b/changelog.d/3966.misc
new file mode 100644
index 0000000000..1e3c8e1706
--- /dev/null
+++ b/changelog.d/3966.misc
@@ -0,0 +1 @@
+Improve the logging when handling a federation transaction
\ No newline at end of file
diff --git a/changelog.d/3967.misc b/changelog.d/3967.misc
new file mode 100644
index 0000000000..dc808aec73
--- /dev/null
+++ b/changelog.d/3967.misc
@@ -0,0 +1 @@
+Clarifications in FederationHandler
diff --git a/changelog.d/3970.bugfix b/changelog.d/3970.bugfix
new file mode 100644
index 0000000000..5625315497
--- /dev/null
+++ b/changelog.d/3970.bugfix
@@ -0,0 +1 @@
+Replaced all occurrences of e.message with str(e). Contributed by Schnuffle
diff --git a/contrib/grafana/synapse.json b/contrib/grafana/synapse.json
index c58612594a..dc3f4a1d1c 100644
--- a/contrib/grafana/synapse.json
+++ b/contrib/grafana/synapse.json
@@ -14,7 +14,7 @@
       "type": "grafana",
       "id": "grafana",
       "name": "Grafana",
-      "version": "5.2.0"
+      "version": "5.2.4"
     },
     {
       "type": "panel",
@@ -54,7 +54,7 @@
   "gnetId": null,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1533598785368,
+  "iteration": 1537878047048,
   "links": [
     {
       "asDropdown": true,
@@ -86,7 +86,7 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": "$datasource",
       "fill": 1,
       "gridPos": {
         "h": 9,
@@ -118,7 +118,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "process_cpu_seconds:rate2m{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}",
+          "expr": "rate(process_cpu_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
           "format": "time_series",
           "intervalFactor": 1,
           "legendFormat": "{{job}}-{{index}} ",
@@ -179,7 +179,7 @@
         "mode": "spectrum"
       },
       "dataFormat": "tsbuckets",
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": "$datasource",
       "gridPos": {
         "h": 9,
         "w": 12,
@@ -525,7 +525,7 @@
             "x": 0,
             "y": 25
           },
-          "id": 48,
+          "id": 50,
           "legend": {
             "avg": false,
             "current": false,
@@ -549,8 +549,9 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "rate(synapse_storage_schedule_time_sum{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])/rate(synapse_storage_schedule_time_count[$bucket_size])",
+              "expr": "rate(python_twisted_reactor_tick_time_sum{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])/rate(python_twisted_reactor_tick_time_count[$bucket_size])",
               "format": "time_series",
+              "interval": "",
               "intervalFactor": 2,
               "legendFormat": "{{job}}-{{index}}",
               "refId": "A",
@@ -560,7 +561,7 @@
           "thresholds": [],
           "timeFrom": null,
           "timeShift": null,
-          "title": "Avg time waiting for db conn",
+          "title": "Avg reactor tick time",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -576,12 +577,11 @@
           },
           "yaxes": [
             {
-              "decimals": null,
               "format": "s",
-              "label": "",
+              "label": null,
               "logBase": 1,
               "max": null,
-              "min": "0",
+              "min": null,
               "show": true
             },
             {
@@ -604,6 +604,7 @@
           "dashLength": 10,
           "dashes": false,
           "datasource": "$datasource",
+          "description": "Shows the time in which the given percentage of reactor ticks completed, over the sampled timespan",
           "fill": 1,
           "gridPos": {
             "h": 7,
@@ -611,7 +612,7 @@
             "x": 12,
             "y": 25
           },
-          "id": 49,
+          "id": 105,
           "legend": {
             "avg": false,
             "current": false,
@@ -629,33 +630,47 @@
           "pointradius": 5,
           "points": false,
           "renderer": "flot",
-          "seriesOverrides": [
-            {
-              "alias": "/^up/",
-              "legend": false,
-              "yaxis": 2
-            }
-          ],
+          "seriesOverrides": [],
           "spaceLength": 10,
           "stack": false,
           "steppedLine": false,
           "targets": [
             {
-              "expr": "scrape_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}",
+              "expr": "histogram_quantile(0.99, rate(python_twisted_reactor_tick_time_bucket{index=~\"$index\",instance=\"$instance\",job=~\"$job\"}[$bucket_size]))",
               "format": "time_series",
               "interval": "",
               "intervalFactor": 2,
-              "legendFormat": "{{job}}-{{index}}",
+              "legendFormat": "{{job}}-{{index}} 99%",
               "refId": "A",
               "step": 20
+            },
+            {
+              "expr": "histogram_quantile(0.95, rate(python_twisted_reactor_tick_time_bucket{index=~\"$index\",instance=\"$instance\",job=~\"$job\"}[$bucket_size]))",
+              "format": "time_series",
+              "intervalFactor": 1,
+              "legendFormat": "{{job}}-{{index}} 95%",
+              "refId": "B"
+            },
+            {
+              "expr": "histogram_quantile(0.90, rate(python_twisted_reactor_tick_time_bucket{index=~\"$index\",instance=\"$instance\",job=~\"$job\"}[$bucket_size]))",
+              "format": "time_series",
+              "intervalFactor": 1,
+              "legendFormat": "{{job}}-{{index}} 90%",
+              "refId": "C"
+            },
+            {
+              "expr": "",
+              "format": "time_series",
+              "intervalFactor": 1,
+              "refId": "D"
             }
           ],
           "thresholds": [],
           "timeFrom": null,
           "timeShift": null,
-          "title": "Prometheus scrape time",
+          "title": "Reactor tick quantiles",
           "tooltip": {
-            "shared": true,
+            "shared": false,
             "sort": 0,
             "value_type": "individual"
           },
@@ -673,16 +688,15 @@
               "label": null,
               "logBase": 1,
               "max": null,
-              "min": "0",
+              "min": null,
               "show": true
             },
             {
-              "decimals": 0,
-              "format": "none",
-              "label": "",
+              "format": "short",
+              "label": null,
               "logBase": 1,
-              "max": "0",
-              "min": "-1",
+              "max": null,
+              "min": null,
               "show": false
             }
           ],
@@ -697,14 +711,14 @@
           "dashLength": 10,
           "dashes": false,
           "datasource": "$datasource",
-          "fill": 1,
+          "fill": 0,
           "gridPos": {
             "h": 7,
             "w": 12,
             "x": 0,
             "y": 32
           },
-          "id": 50,
+          "id": 53,
           "legend": {
             "avg": false,
             "current": false,
@@ -728,19 +742,17 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "rate(python_twisted_reactor_tick_time_sum{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])/rate(python_twisted_reactor_tick_time_count[$bucket_size])",
+              "expr": "min_over_time(up{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
-              "interval": "",
               "intervalFactor": 2,
               "legendFormat": "{{job}}-{{index}}",
-              "refId": "A",
-              "step": 20
+              "refId": "A"
             }
           ],
           "thresholds": [],
           "timeFrom": null,
           "timeShift": null,
-          "title": "Avg reactor tick time",
+          "title": "Up",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -756,7 +768,7 @@
           },
           "yaxes": [
             {
-              "format": "s",
+              "format": "short",
               "label": null,
               "logBase": 1,
               "max": null,
@@ -769,7 +781,7 @@
               "logBase": 1,
               "max": null,
               "min": null,
-              "show": false
+              "show": true
             }
           ],
           "yaxis": {
@@ -783,26 +795,19 @@
           "dashLength": 10,
           "dashes": false,
           "datasource": "$datasource",
-          "editable": true,
-          "error": false,
           "fill": 1,
-          "grid": {},
           "gridPos": {
             "h": 7,
             "w": 12,
             "x": 12,
             "y": 32
           },
-          "id": 5,
+          "id": 49,
           "legend": {
-            "alignAsTable": false,
             "avg": false,
             "current": false,
-            "hideEmpty": false,
-            "hideZero": false,
             "max": false,
             "min": false,
-            "rightSide": false,
             "show": true,
             "total": false,
             "values": false
@@ -817,10 +822,9 @@
           "renderer": "flot",
           "seriesOverrides": [
             {
-              "alias": "/user/"
-            },
-            {
-              "alias": "/system/"
+              "alias": "/^up/",
+              "legend": false,
+              "yaxis": 2
             }
           ],
           "spaceLength": 10,
@@ -828,44 +832,19 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "rate(process_cpu_system_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
-              "format": "time_series",
-              "intervalFactor": 1,
-              "legendFormat": "{{job}}-{{index}} system ",
-              "metric": "",
-              "refId": "B",
-              "step": 20
-            },
-            {
-              "expr": "rate(process_cpu_user_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "scrape_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}",
               "format": "time_series",
-              "hide": false,
               "interval": "",
-              "intervalFactor": 1,
-              "legendFormat": "{{job}}-{{index}} user",
+              "intervalFactor": 2,
+              "legendFormat": "{{job}}-{{index}}",
               "refId": "A",
               "step": 20
             }
           ],
-          "thresholds": [
-            {
-              "colorMode": "custom",
-              "line": true,
-              "lineColor": "rgba(216, 200, 27, 0.27)",
-              "op": "gt",
-              "value": 0.5
-            },
-            {
-              "colorMode": "custom",
-              "line": true,
-              "lineColor": "rgba(234, 112, 112, 0.22)",
-              "op": "gt",
-              "value": 0.8
-            }
-          ],
+          "thresholds": [],
           "timeFrom": null,
           "timeShift": null,
-          "title": "CPU",
+          "title": "Prometheus scrape time",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -881,20 +860,21 @@
           },
           "yaxes": [
             {
-              "decimals": null,
-              "format": "percentunit",
-              "label": "",
+              "format": "s",
+              "label": null,
               "logBase": 1,
-              "max": "1.2",
-              "min": 0,
+              "max": null,
+              "min": "0",
               "show": true
             },
             {
-              "format": "short",
+              "decimals": 0,
+              "format": "none",
+              "label": "",
               "logBase": 1,
-              "max": null,
-              "min": null,
-              "show": true
+              "max": "0",
+              "min": "-1",
+              "show": false
             }
           ],
           "yaxis": {
@@ -907,20 +887,27 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
-          "fill": 0,
+          "datasource": "$datasource",
+          "editable": true,
+          "error": false,
+          "fill": 1,
+          "grid": {},
           "gridPos": {
             "h": 7,
             "w": 12,
             "x": 0,
             "y": 39
           },
-          "id": 53,
+          "id": 5,
           "legend": {
+            "alignAsTable": false,
             "avg": false,
             "current": false,
+            "hideEmpty": false,
+            "hideZero": false,
             "max": false,
             "min": false,
+            "rightSide": false,
             "show": true,
             "total": false,
             "values": false
@@ -933,23 +920,57 @@
           "pointradius": 5,
           "points": false,
           "renderer": "flot",
-          "seriesOverrides": [],
+          "seriesOverrides": [
+            {
+              "alias": "/user/"
+            },
+            {
+              "alias": "/system/"
+            }
+          ],
           "spaceLength": 10,
           "stack": false,
           "steppedLine": false,
           "targets": [
             {
-              "expr": "min_over_time(up{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(process_cpu_system_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
-              "intervalFactor": 2,
-              "legendFormat": "{{job}}-{{index}}",
-              "refId": "A"
+              "intervalFactor": 1,
+              "legendFormat": "{{job}}-{{index}} system ",
+              "metric": "",
+              "refId": "B",
+              "step": 20
+            },
+            {
+              "expr": "rate(process_cpu_user_seconds_total{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "format": "time_series",
+              "hide": false,
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{job}}-{{index}} user",
+              "refId": "A",
+              "step": 20
+            }
+          ],
+          "thresholds": [
+            {
+              "colorMode": "custom",
+              "line": true,
+              "lineColor": "rgba(216, 200, 27, 0.27)",
+              "op": "gt",
+              "value": 0.5
+            },
+            {
+              "colorMode": "custom",
+              "line": true,
+              "lineColor": "rgba(234, 112, 112, 0.22)",
+              "op": "gt",
+              "value": 0.8
             }
           ],
-          "thresholds": [],
           "timeFrom": null,
           "timeShift": null,
-          "title": "Up",
+          "title": "CPU",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -965,16 +986,16 @@
           },
           "yaxes": [
             {
-              "format": "short",
-              "label": null,
+              "decimals": null,
+              "format": "percentunit",
+              "label": "",
               "logBase": 1,
-              "max": null,
-              "min": null,
+              "max": "1.2",
+              "min": 0,
               "show": true
             },
             {
               "format": "short",
-              "label": null,
               "logBase": 1,
               "max": null,
               "min": null,
@@ -1013,7 +1034,7 @@
             "h": 7,
             "w": 12,
             "x": 0,
-            "y": 49
+            "y": 47
           },
           "id": 40,
           "legend": {
@@ -1098,7 +1119,7 @@
             "h": 7,
             "w": 12,
             "x": 12,
-            "y": 49
+            "y": 47
           },
           "id": 46,
           "legend": {
@@ -1187,7 +1208,7 @@
             "h": 7,
             "w": 12,
             "x": 0,
-            "y": 56
+            "y": 54
           },
           "id": 44,
           "legend": {
@@ -1276,7 +1297,7 @@
             "h": 7,
             "w": 12,
             "x": 12,
-            "y": 56
+            "y": 54
           },
           "id": 45,
           "legend": {
@@ -1383,7 +1404,7 @@
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 48
+            "y": 62
           },
           "id": 4,
           "legend": {
@@ -1490,7 +1511,7 @@
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 48
+            "y": 62
           },
           "id": 32,
           "legend": {
@@ -1578,7 +1599,7 @@
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 56
+            "y": 70
           },
           "id": 23,
           "legend": {
@@ -1688,7 +1709,7 @@
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 56
+            "y": 70
           },
           "id": 52,
           "legend": {
@@ -1795,7 +1816,7 @@
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 64
+            "y": 78
           },
           "id": 7,
           "legend": {
@@ -1886,7 +1907,7 @@
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 64
+            "y": 78
           },
           "id": 47,
           "legend": {
@@ -1969,13 +1990,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 72
+            "y": 86
           },
           "id": 103,
           "legend": {
@@ -2069,13 +2090,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 23
+            "y": 49
           },
           "id": 99,
           "legend": {
@@ -2154,13 +2175,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 23
+            "y": 49
           },
           "id": 101,
           "legend": {
@@ -2186,17 +2207,24 @@
           "steppedLine": false,
           "targets": [
             {
-              "expr": "rate(synapse_background_process_db_txn_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
+              "expr": "rate(synapse_background_process_db_txn_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]) +  rate(synapse_background_process_db_sched_duration_seconds{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
+              "hide": false,
               "intervalFactor": 1,
               "legendFormat": "{{job}}-{{index}} {{name}}",
               "refId": "A"
+            },
+            {
+              "expr": "",
+              "format": "time_series",
+              "intervalFactor": 1,
+              "refId": "B"
             }
           ],
           "thresholds": [],
           "timeFrom": null,
           "timeShift": null,
-          "title": "DB usage by background jobs",
+          "title": "DB usage by background jobs (including scheduling time)",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -2252,13 +2280,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 25
+            "y": 64
           },
           "id": 79,
           "legend": {
@@ -2336,13 +2364,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 25
+            "y": 64
           },
           "id": 83,
           "legend": {
@@ -2447,7 +2475,7 @@
             "h": 7,
             "w": 12,
             "x": 0,
-            "y": 23
+            "y": 65
           },
           "id": 51,
           "legend": {
@@ -2551,6 +2579,194 @@
           "dashLength": 10,
           "dashes": false,
           "datasource": "$datasource",
+          "fill": 1,
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 0,
+            "y": 24
+          },
+          "id": 48,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "rate(synapse_storage_schedule_time_sum{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])/rate(synapse_storage_schedule_time_count[$bucket_size])",
+              "format": "time_series",
+              "intervalFactor": 2,
+              "legendFormat": "{{job}}-{{index}}",
+              "refId": "A",
+              "step": 20
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Avg time waiting for db conn",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": null,
+              "format": "s",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$datasource",
+          "description": "Shows the time in which the given percentage of database queries were scheduled, over the sampled timespan",
+          "fill": 1,
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 12,
+            "y": 24
+          },
+          "id": 104,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "nullPointMode": "null",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.99, rate(synapse_storage_schedule_time_bucket{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
+              "format": "time_series",
+              "hide": false,
+              "intervalFactor": 1,
+              "legendFormat": "{{job}} {{index}} 99%",
+              "refId": "A",
+              "step": 20
+            },
+            {
+              "expr": "histogram_quantile(0.95, rate(synapse_storage_schedule_time_bucket{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
+              "format": "time_series",
+              "intervalFactor": 1,
+              "legendFormat": "{{job}} {{index}} 95%",
+              "refId": "B"
+            },
+            {
+              "expr": "histogram_quantile(0.90, rate(synapse_storage_schedule_time_bucket{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
+              "format": "time_series",
+              "intervalFactor": 1,
+              "legendFormat": "{{job}} {{index}} 90%",
+              "refId": "C"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Db scheduling time quantiles",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": null,
+              "format": "s",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$datasource",
           "editable": true,
           "error": false,
           "fill": 0,
@@ -2559,7 +2775,7 @@
             "h": 7,
             "w": 12,
             "x": 0,
-            "y": 25
+            "y": 31
           },
           "id": 10,
           "legend": {
@@ -2648,7 +2864,7 @@
             "h": 7,
             "w": 12,
             "x": 12,
-            "y": 25
+            "y": 31
           },
           "id": 11,
           "legend": {
@@ -2672,11 +2888,11 @@
           "renderer": "flot",
           "seriesOverrides": [],
           "spaceLength": 10,
-          "stack": false,
+          "stack": true,
           "steppedLine": true,
           "targets": [
             {
-              "expr": "topk(5, rate(synapse_storage_transaction_time_sum{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size]))",
+              "expr": "rate(synapse_storage_transaction_time_sum{instance=\"$instance\",job=~\"$job\",index=~\"$index\"}[$bucket_size])",
               "format": "time_series",
               "instant": false,
               "interval": "",
@@ -2753,7 +2969,7 @@
             "h": 13,
             "w": 12,
             "x": 0,
-            "y": 17
+            "y": 60
           },
           "id": 12,
           "legend": {
@@ -2841,7 +3057,7 @@
             "h": 13,
             "w": 12,
             "x": 12,
-            "y": 17
+            "y": 60
           },
           "id": 26,
           "legend": {
@@ -2929,7 +3145,7 @@
             "h": 13,
             "w": 12,
             "x": 0,
-            "y": 30
+            "y": 73
           },
           "id": 13,
           "legend": {
@@ -3017,7 +3233,7 @@
             "h": 13,
             "w": 12,
             "x": 12,
-            "y": 30
+            "y": 73
           },
           "id": 27,
           "legend": {
@@ -3105,7 +3321,7 @@
             "h": 13,
             "w": 12,
             "x": 0,
-            "y": 43
+            "y": 86
           },
           "id": 28,
           "legend": {
@@ -3192,7 +3408,7 @@
             "h": 13,
             "w": 12,
             "x": 12,
-            "y": 43
+            "y": 86
           },
           "id": 25,
           "legend": {
@@ -3295,7 +3511,7 @@
             "h": 10,
             "w": 12,
             "x": 0,
-            "y": 55
+            "y": 68
           },
           "id": 1,
           "legend": {
@@ -3387,7 +3603,7 @@
             "h": 10,
             "w": 12,
             "x": 12,
-            "y": 55
+            "y": 68
           },
           "id": 8,
           "legend": {
@@ -3477,7 +3693,7 @@
             "h": 10,
             "w": 12,
             "x": 0,
-            "y": 65
+            "y": 78
           },
           "id": 38,
           "legend": {
@@ -3563,7 +3779,7 @@
             "h": 10,
             "w": 12,
             "x": 12,
-            "y": 65
+            "y": 78
           },
           "id": 39,
           "legend": {
@@ -3643,13 +3859,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 75
+            "y": 88
           },
           "id": 65,
           "legend": {
@@ -3745,13 +3961,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 90
+            "y": 27
           },
           "id": 91,
           "legend": {
@@ -3841,7 +4057,7 @@
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 90
+            "y": 27
           },
           "id": 21,
           "legend": {
@@ -3920,13 +4136,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 99
+            "y": 36
           },
           "id": 89,
           "legend": {
@@ -4006,13 +4222,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 99
+            "y": 36
           },
           "id": 93,
           "legend": {
@@ -4027,7 +4243,7 @@
           "lines": true,
           "linewidth": 1,
           "links": [],
-          "nullPointMode": "null",
+          "nullPointMode": "connected",
           "percentage": false,
           "pointradius": 5,
           "points": false,
@@ -4090,13 +4306,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 108
+            "y": 45
           },
           "id": 95,
           "legend": {
@@ -4189,7 +4405,7 @@
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 108
+            "y": 45
           },
           "heatmap": {},
           "highlightCards": true,
@@ -4251,13 +4467,13 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 7,
             "w": 12,
             "x": 0,
-            "y": 19
+            "y": 97
           },
           "id": 2,
           "legend": {
@@ -4357,20 +4573,24 @@
               "min": null,
               "show": true
             }
-          ]
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
         },
         {
           "aliasColors": {},
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 7,
             "w": 12,
             "x": 12,
-            "y": 19
+            "y": 97
           },
           "id": 41,
           "legend": {
@@ -4439,20 +4659,24 @@
               "min": null,
               "show": true
             }
-          ]
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
         },
         {
           "aliasColors": {},
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 7,
             "w": 12,
             "x": 0,
-            "y": 26
+            "y": 104
           },
           "id": 42,
           "legend": {
@@ -4520,20 +4744,24 @@
               "min": null,
               "show": true
             }
-          ]
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
         },
         {
           "aliasColors": {},
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 7,
             "w": 12,
             "x": 12,
-            "y": 26
+            "y": 104
           },
           "id": 43,
           "legend": {
@@ -4601,7 +4829,11 @@
               "min": null,
               "show": true
             }
-          ]
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
         }
       ],
       "repeat": null,
@@ -4623,7 +4855,7 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
@@ -4644,7 +4876,7 @@
           "lines": true,
           "linewidth": 1,
           "links": [],
-          "nullPointMode": "null",
+          "nullPointMode": "connected",
           "percentage": false,
           "pointradius": 5,
           "points": false,
@@ -4708,7 +4940,7 @@
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "${DS_PROMETHEUS}",
+          "datasource": "$datasource",
           "fill": 1,
           "gridPos": {
             "h": 9,
@@ -4729,7 +4961,7 @@
           "lines": true,
           "linewidth": 1,
           "links": [],
-          "nullPointMode": "null",
+          "nullPointMode": "connected",
           "percentage": false,
           "pointradius": 5,
           "points": false,
@@ -4856,9 +5088,19 @@
             "selected": false,
             "text": "5m",
             "value": "5m"
+          },
+          {
+            "selected": false,
+            "text": "10m",
+            "value": "10m"
+          },
+          {
+            "selected": false,
+            "text": "15m",
+            "value": "15m"
           }
         ],
-        "query": "30s,1m,2m,5m",
+        "query": "30s,1m,2m,5m,10m,15m",
         "refresh": 2,
         "type": "interval"
       },
@@ -4872,7 +5114,7 @@
         "multi": false,
         "name": "instance",
         "options": [],
-        "query": "label_values(process_cpu_user_seconds_total{job=~\"synapse.*\"}, instance)",
+        "query": "label_values(synapse_util_metrics_block_ru_utime_seconds, instance)",
         "refresh": 2,
         "regex": "",
         "sort": 0,
@@ -4895,7 +5137,7 @@
         "multiFormat": "regex values",
         "name": "job",
         "options": [],
-        "query": "label_values(process_cpu_user_seconds_total{job=~\"synapse.*\"}, job)",
+        "query": "label_values(synapse_util_metrics_block_ru_utime_seconds, job)",
         "refresh": 2,
         "refresh_on_load": false,
         "regex": "",
@@ -4919,7 +5161,7 @@
         "multiFormat": "regex values",
         "name": "index",
         "options": [],
-        "query": "label_values(process_cpu_user_seconds_total{job=~\"synapse.*\"}, index)",
+        "query": "label_values(synapse_util_metrics_block_ru_utime_seconds, index)",
         "refresh": 2,
         "refresh_on_load": false,
         "regex": "",
@@ -4965,5 +5207,5 @@
   "timezone": "",
   "title": "Synapse",
   "uid": "000000012",
-  "version": 127
+  "version": 3
 }
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 20d3fe3bd8..1d00defc2d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,4 +1,5 @@
-FROM docker.io/python:2-alpine3.8
+ARG PYTHON_VERSION=2
+FROM docker.io/python:${PYTHON_VERSION}-alpine3.8
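+# e.g. build a Python 3 based image with:
+#   docker build --build-arg PYTHON_VERSION=3 .
+# (assuming a matching python:3-alpine3.8 tag exists upstream)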
 
 COPY . /synapse
 
diff --git a/docker/Dockerfile-pgtests b/docker/Dockerfile-pgtests
new file mode 100644
index 0000000000..7da8eeb9eb
--- /dev/null
+++ b/docker/Dockerfile-pgtests
@@ -0,0 +1,12 @@
+# Use the Sytest image that comes with a lot of the build dependencies
+# pre-installed
+FROM matrixdotorg/sytest:latest
+
+# The Sytest image doesn't come with python, so install that
+RUN apt-get -qq install -y python python-dev python-pip
+
+# We need tox to run the tests in run_pg_tests.sh
+RUN pip install tox
+
+ADD run_pg_tests.sh /pg_tests.sh
+ENTRYPOINT /pg_tests.sh
diff --git a/docker/run_pg_tests.sh b/docker/run_pg_tests.sh
new file mode 100755
index 0000000000..e77424c41a
--- /dev/null
+++ b/docker/run_pg_tests.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# This script runs the PostgreSQL tests inside a Docker container. It expects
+# the relevant source files to be mounted into /src (done automatically by the
+# caller script). It will set up the database, start it, and then use the tox
+# configuration to run the tests.
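+#
+# An illustrative invocation (the image tag here is hypothetical; in practice
+# the caller script drives this):
+#   docker build -f docker/Dockerfile-pgtests -t synapse-pgtests docker/
+#   docker run --rm -v /path/to/synapse:/src synapse-pgtests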
+
+set -e
+
+# Set PGUSER so Synapse's tests know what user to connect to the database with
+export PGUSER=postgres
+
+# Initialise & start the database
+su -c '/usr/lib/postgresql/9.6/bin/initdb -D /var/lib/postgresql/data -E "UTF-8" --lc-collate="en_US.UTF-8" --lc-ctype="en_US.UTF-8" --username=postgres' postgres
+su -c '/usr/lib/postgresql/9.6/bin/pg_ctl -w -D /var/lib/postgresql/data start' postgres
+
+# Run the tests
+cd /src
+export TRIAL_FLAGS="-j 4"
+tox --workdir=/tmp -e py27-postgres
diff --git a/docker/start.py b/docker/start.py
index 90e8b9c51a..346df8c87f 100755
--- a/docker/start.py
+++ b/docker/start.py
@@ -5,6 +5,7 @@ import os
 import sys
 import subprocess
 import glob
+import codecs
 
 # Utility functions
 convert = lambda src, dst, environ: open(dst, "w").write(jinja2.Template(open(src).read()).render(**environ))
@@ -23,7 +24,7 @@ def generate_secrets(environ, secrets):
                 with open(filename) as handle: value = handle.read()
             else:
                 print("Generating a random secret for {}".format(name))
-                value = os.urandom(32).encode("hex")
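+                # codecs.encode(bytes, "hex") works on both Python 2 and 3,
+                # whereas .encode("hex") on a byte string is Python 2 only.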
+                value = codecs.encode(os.urandom(32), "hex").decode()
                 with open(filename, "w") as handle: handle.write(value)
             environ[secret] = value
 
diff --git a/scripts-dev/dump_macaroon.py b/scripts-dev/dump_macaroon.py
index 6e45be75d6..fcc5568835 100755
--- a/scripts-dev/dump_macaroon.py
+++ b/scripts-dev/dump_macaroon.py
@@ -21,4 +21,4 @@ try:
     verifier.verify(macaroon, key)
     print "Signature is correct"
 except Exception as e:
-    print e.message
+    print str(e)
diff --git a/scripts-dev/next_github_number.sh b/scripts-dev/next_github_number.sh
new file mode 100755
index 0000000000..376280025a
--- /dev/null
+++ b/scripts-dev/next_github_number.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -e
+
+# Fetch the current GitHub issue number, add one to it -- presto! The likely
+# next PR number.
+CURRENT_NUMBER=`curl -s "https://api.github.com/repos/matrix-org/synapse/issues?state=all&per_page=1" | jq -r ".[0].number"`
+CURRENT_NUMBER=$((CURRENT_NUMBER+1))
+echo $CURRENT_NUMBER
\ No newline at end of file
diff --git a/synapse/__init__.py b/synapse/__init__.py
index 9dbe0b9f10..b1f7a89fba 100644
--- a/synapse/__init__.py
+++ b/synapse/__init__.py
@@ -27,4 +27,4 @@ try:
 except ImportError:
     pass
 
-__version__ = "0.33.4"
+__version__ = "0.33.5.1"
diff --git a/synapse/api/filtering.py b/synapse/api/filtering.py
index a31a9a17e0..eed8c67e6a 100644
--- a/synapse/api/filtering.py
+++ b/synapse/api/filtering.py
@@ -226,7 +226,7 @@ class Filtering(object):
             jsonschema.validate(user_filter_json, USER_FILTER_SCHEMA,
                                 format_checker=FormatChecker())
         except jsonschema.ValidationError as e:
-            raise SynapseError(400, e.message)
+            raise SynapseError(400, str(e))
 
 
 class FilterCollection(object):
diff --git a/synapse/app/__init__.py b/synapse/app/__init__.py
index 3b6b9368b8..c3afcc573b 100644
--- a/synapse/app/__init__.py
+++ b/synapse/app/__init__.py
@@ -24,7 +24,7 @@ try:
     python_dependencies.check_requirements()
 except python_dependencies.MissingRequirementError as e:
     message = "\n".join([
-        "Missing Requirement: %s" % (e.message,),
+        "Missing Requirement: %s" % (str(e),),
         "To install run:",
         "    pip install --upgrade --force \"%s\"" % (e.dependency,),
         "",
diff --git a/synapse/app/appservice.py b/synapse/app/appservice.py
index 86b5067400..8559e141af 100644
--- a/synapse/app/appservice.py
+++ b/synapse/app/appservice.py
@@ -136,7 +136,7 @@ def start(config_options):
             "Synapse appservice", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.appservice"
@@ -172,7 +172,6 @@ def start(config_options):
 
     def start():
         ps.get_datastore().start_profiling()
-        ps.get_state_handler().start_caching()
 
     reactor.callWhenRunning(start)
 
diff --git a/synapse/app/client_reader.py b/synapse/app/client_reader.py
index ce2b113dbb..76aed8c60a 100644
--- a/synapse/app/client_reader.py
+++ b/synapse/app/client_reader.py
@@ -153,7 +153,7 @@ def start(config_options):
             "Synapse client reader", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.client_reader"
@@ -181,7 +181,6 @@ def start(config_options):
     ss.start_listening(config.worker_listeners)
 
     def start():
-        ss.get_state_handler().start_caching()
         ss.get_datastore().start_profiling()
 
     reactor.callWhenRunning(start)
diff --git a/synapse/app/event_creator.py b/synapse/app/event_creator.py
index f98e456ea0..9060ab14f6 100644
--- a/synapse/app/event_creator.py
+++ b/synapse/app/event_creator.py
@@ -169,7 +169,7 @@ def start(config_options):
             "Synapse event creator", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.event_creator"
@@ -199,7 +199,6 @@ def start(config_options):
     ss.start_listening(config.worker_listeners)
 
     def start():
-        ss.get_state_handler().start_caching()
         ss.get_datastore().start_profiling()
 
     reactor.callWhenRunning(start)
diff --git a/synapse/app/federation_reader.py b/synapse/app/federation_reader.py
index 60f5973505..228a297fb8 100644
--- a/synapse/app/federation_reader.py
+++ b/synapse/app/federation_reader.py
@@ -140,7 +140,7 @@ def start(config_options):
             "Synapse federation reader", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.federation_reader"
@@ -168,7 +168,6 @@ def start(config_options):
     ss.start_listening(config.worker_listeners)
 
     def start():
-        ss.get_state_handler().start_caching()
         ss.get_datastore().start_profiling()
 
     reactor.callWhenRunning(start)
diff --git a/synapse/app/federation_sender.py b/synapse/app/federation_sender.py
index 60dd09aac3..e9a99d76e1 100644
--- a/synapse/app/federation_sender.py
+++ b/synapse/app/federation_sender.py
@@ -160,7 +160,7 @@ def start(config_options):
             "Synapse federation sender", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.federation_sender"
@@ -201,7 +201,6 @@ def start(config_options):
 
     def start():
         ps.get_datastore().start_profiling()
-        ps.get_state_handler().start_caching()
 
     reactor.callWhenRunning(start)
     _base.start_worker_reactor("synapse-federation-sender", config)
diff --git a/synapse/app/frontend_proxy.py b/synapse/app/frontend_proxy.py
index 8c0b9c67b0..fc4b25de1c 100644
--- a/synapse/app/frontend_proxy.py
+++ b/synapse/app/frontend_proxy.py
@@ -228,7 +228,7 @@ def start(config_options):
             "Synapse frontend proxy", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.frontend_proxy"
@@ -258,7 +258,6 @@ def start(config_options):
     ss.start_listening(config.worker_listeners)
 
     def start():
-        ss.get_state_handler().start_caching()
         ss.get_datastore().start_profiling()
 
     reactor.callWhenRunning(start)
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index 3241ded188..a98fdbd210 100755
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -301,7 +301,7 @@ class SynapseHomeServer(HomeServer):
         try:
             database_engine.check_database(db_conn.cursor())
         except IncorrectDatabaseSetup as e:
-            quit_with_error(e.message)
+            quit_with_error(str(e))
 
 
 # Gauges to expose monthly active user control metrics
@@ -328,7 +328,7 @@ def setup(config_options):
             config_options,
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     if not config:
@@ -384,7 +384,6 @@ def setup(config_options):
 
     def start():
         hs.get_pusherpool().start()
-        hs.get_state_handler().start_caching()
         hs.get_datastore().start_profiling()
         hs.get_datastore().start_doing_background_updates()
         hs.get_federation_client().start_get_pdu_cache()
diff --git a/synapse/app/media_repository.py b/synapse/app/media_repository.py
index e3dbb3b4e6..acc0487adc 100644
--- a/synapse/app/media_repository.py
+++ b/synapse/app/media_repository.py
@@ -133,7 +133,7 @@ def start(config_options):
             "Synapse media repository", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.media_repository"
@@ -168,7 +168,6 @@ def start(config_options):
     ss.start_listening(config.worker_listeners)
 
     def start():
-        ss.get_state_handler().start_caching()
         ss.get_datastore().start_profiling()
 
     reactor.callWhenRunning(start)
diff --git a/synapse/app/pusher.py b/synapse/app/pusher.py
index 244c604de9..630dcda478 100644
--- a/synapse/app/pusher.py
+++ b/synapse/app/pusher.py
@@ -191,7 +191,7 @@ def start(config_options):
             "Synapse pusher", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.pusher"
@@ -228,7 +228,6 @@ def start(config_options):
     def start():
         ps.get_pusherpool().start()
         ps.get_datastore().start_profiling()
-        ps.get_state_handler().start_caching()
 
     reactor.callWhenRunning(start)
 
diff --git a/synapse/app/synchrotron.py b/synapse/app/synchrotron.py
index 6662340797..9a7fc6ee9d 100644
--- a/synapse/app/synchrotron.py
+++ b/synapse/app/synchrotron.py
@@ -410,7 +410,7 @@ def start(config_options):
             "Synapse synchrotron", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.synchrotron"
@@ -435,7 +435,6 @@ def start(config_options):
 
     def start():
         ss.get_datastore().start_profiling()
-        ss.get_state_handler().start_caching()
 
     reactor.callWhenRunning(start)
 
diff --git a/synapse/app/synctl.py b/synapse/app/synctl.py
deleted file mode 100755
index d658f967ba..0000000000
--- a/synapse/app/synctl.py
+++ /dev/null
@@ -1,284 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright 2014-2016 OpenMarket Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import collections
-import errno
-import glob
-import os
-import os.path
-import signal
-import subprocess
-import sys
-import time
-
-from six import iteritems
-
-import yaml
-
-SYNAPSE = [sys.executable, "-B", "-m", "synapse.app.homeserver"]
-
-GREEN = "\x1b[1;32m"
-YELLOW = "\x1b[1;33m"
-RED = "\x1b[1;31m"
-NORMAL = "\x1b[m"
-
-
-def pid_running(pid):
-    try:
-        os.kill(pid, 0)
-        return True
-    except OSError as err:
-        if err.errno == errno.EPERM:
-            return True
-        return False
-
-
-def write(message, colour=NORMAL, stream=sys.stdout):
-    if colour == NORMAL:
-        stream.write(message + "\n")
-    else:
-        stream.write(colour + message + NORMAL + "\n")
-
-
-def abort(message, colour=RED, stream=sys.stderr):
-    write(message, colour, stream)
-    sys.exit(1)
-
-
-def start(configfile):
-    write("Starting ...")
-    args = SYNAPSE
-    args.extend(["--daemonize", "-c", configfile])
-
-    try:
-        subprocess.check_call(args)
-        write("started synapse.app.homeserver(%r)" %
-              (configfile,), colour=GREEN)
-    except subprocess.CalledProcessError as e:
-        write(
-            "error starting (exit code: %d); see above for logs" % e.returncode,
-            colour=RED,
-        )
-
-
-def start_worker(app, configfile, worker_configfile):
-    args = [
-        "python", "-B",
-        "-m", app,
-        "-c", configfile,
-        "-c", worker_configfile
-    ]
-
-    try:
-        subprocess.check_call(args)
-        write("started %s(%r)" % (app, worker_configfile), colour=GREEN)
-    except subprocess.CalledProcessError as e:
-        write(
-            "error starting %s(%r) (exit code: %d); see above for logs" % (
-                app, worker_configfile, e.returncode,
-            ),
-            colour=RED,
-        )
-
-
-def stop(pidfile, app):
-    if os.path.exists(pidfile):
-        pid = int(open(pidfile).read())
-        try:
-            os.kill(pid, signal.SIGTERM)
-            write("stopped %s" % (app,), colour=GREEN)
-        except OSError as err:
-            if err.errno == errno.ESRCH:
-                write("%s not running" % (app,), colour=YELLOW)
-            elif err.errno == errno.EPERM:
-                abort("Cannot stop %s: Operation not permitted" % (app,))
-            else:
-                abort("Cannot stop %s: Unknown error" % (app,))
-
-
-Worker = collections.namedtuple("Worker", [
-    "app", "configfile", "pidfile", "cache_factor"
-])
-
-
-def main():
-
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "action",
-        choices=["start", "stop", "restart"],
-        help="whether to start, stop or restart the synapse",
-    )
-    parser.add_argument(
-        "configfile",
-        nargs="?",
-        default="homeserver.yaml",
-        help="the homeserver config file, defaults to homeserver.yaml",
-    )
-    parser.add_argument(
-        "-w", "--worker",
-        metavar="WORKERCONFIG",
-        help="start or stop a single worker",
-    )
-    parser.add_argument(
-        "-a", "--all-processes",
-        metavar="WORKERCONFIGDIR",
-        help="start or stop all the workers in the given directory"
-             " and the main synapse process",
-    )
-
-    options = parser.parse_args()
-
-    if options.worker and options.all_processes:
-        write(
-            'Cannot use "--worker" with "--all-processes"',
-            stream=sys.stderr
-        )
-        sys.exit(1)
-
-    configfile = options.configfile
-
-    if not os.path.exists(configfile):
-        write(
-            "No config file found\n"
-            "To generate a config file, run '%s -c %s --generate-config"
-            " --server-name=<server name>'\n" % (
-                " ".join(SYNAPSE), options.configfile
-            ),
-            stream=sys.stderr,
-        )
-        sys.exit(1)
-
-    with open(configfile) as stream:
-        config = yaml.load(stream)
-
-    pidfile = config["pid_file"]
-    cache_factor = config.get("synctl_cache_factor")
-    start_stop_synapse = True
-
-    if cache_factor:
-        os.environ["SYNAPSE_CACHE_FACTOR"] = str(cache_factor)
-
-    cache_factors = config.get("synctl_cache_factors", {})
-    for cache_name, factor in iteritems(cache_factors):
-        os.environ["SYNAPSE_CACHE_FACTOR_" + cache_name.upper()] = str(factor)
-
-    worker_configfiles = []
-    if options.worker:
-        start_stop_synapse = False
-        worker_configfile = options.worker
-        if not os.path.exists(worker_configfile):
-            write(
-                "No worker config found at %r" % (worker_configfile,),
-                stream=sys.stderr,
-            )
-            sys.exit(1)
-        worker_configfiles.append(worker_configfile)
-
-    if options.all_processes:
-        # To start the main synapse with -a you need to add a worker file
-        # with worker_app == "synapse.app.homeserver"
-        start_stop_synapse = False
-        worker_configdir = options.all_processes
-        if not os.path.isdir(worker_configdir):
-            write(
-                "No worker config directory found at %r" % (worker_configdir,),
-                stream=sys.stderr,
-            )
-            sys.exit(1)
-        worker_configfiles.extend(sorted(glob.glob(
-            os.path.join(worker_configdir, "*.yaml")
-        )))
-
-    workers = []
-    for worker_configfile in worker_configfiles:
-        with open(worker_configfile) as stream:
-            worker_config = yaml.load(stream)
-        worker_app = worker_config["worker_app"]
-        if worker_app == "synapse.app.homeserver":
-            # We need to special case all of this to pick up options that may
-            # be set in the main config file or in this worker config file.
-            worker_pidfile = (
-                worker_config.get("pid_file")
-                or pidfile
-            )
-            worker_cache_factor = worker_config.get("synctl_cache_factor") or cache_factor
-            daemonize = worker_config.get("daemonize") or config.get("daemonize")
-            assert daemonize, "Main process must have daemonize set to true"
-
-            # The master process doesn't support using worker_* config.
-            for key in worker_config:
-                if key == "worker_app":  # But we allow worker_app
-                    continue
-                assert not key.startswith("worker_"), \
-                    "Main process cannot use worker_* config"
-        else:
-            worker_pidfile = worker_config["worker_pid_file"]
-            worker_daemonize = worker_config["worker_daemonize"]
-            assert worker_daemonize, "In config %r: expected '%s' to be True" % (
-                worker_configfile, "worker_daemonize")
-            worker_cache_factor = worker_config.get("synctl_cache_factor")
-        workers.append(Worker(
-            worker_app, worker_configfile, worker_pidfile, worker_cache_factor,
-        ))
-
-    action = options.action
-
-    if action == "stop" or action == "restart":
-        for worker in workers:
-            stop(worker.pidfile, worker.app)
-
-        if start_stop_synapse:
-            stop(pidfile, "synapse.app.homeserver")
-
-    # Wait for synapse to actually shutdown before starting it again
-    if action == "restart":
-        running_pids = []
-        if start_stop_synapse and os.path.exists(pidfile):
-            running_pids.append(int(open(pidfile).read()))
-        for worker in workers:
-            if os.path.exists(worker.pidfile):
-                running_pids.append(int(open(worker.pidfile).read()))
-        if len(running_pids) > 0:
-            write("Waiting for process to exit before restarting...")
-            for running_pid in running_pids:
-                while pid_running(running_pid):
-                    time.sleep(0.2)
-            write("All processes exited; now restarting...")
-
-    if action == "start" or action == "restart":
-        if start_stop_synapse:
-            # Check if synapse is already running
-            if os.path.exists(pidfile) and pid_running(int(open(pidfile).read())):
-                abort("synapse.app.homeserver already running")
-            start(configfile)
-
-        for worker in workers:
-            if worker.cache_factor:
-                os.environ["SYNAPSE_CACHE_FACTOR"] = str(worker.cache_factor)
-
-            start_worker(worker.app, configfile, worker.configfile)
-
-            if cache_factor:
-                os.environ["SYNAPSE_CACHE_FACTOR"] = str(cache_factor)
-            else:
-                os.environ.pop("SYNAPSE_CACHE_FACTOR", None)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/synapse/app/user_dir.py b/synapse/app/user_dir.py
index 96ffcaf073..0a5f62b509 100644
--- a/synapse/app/user_dir.py
+++ b/synapse/app/user_dir.py
@@ -188,7 +188,7 @@ def start(config_options):
             "Synapse user directory", config_options
         )
     except ConfigError as e:
-        sys.stderr.write("\n" + e.message + "\n")
+        sys.stderr.write("\n" + str(e) + "\n")
         sys.exit(1)
 
     assert config.worker_app == "synapse.app.user_dir"
@@ -229,7 +229,6 @@ def start(config_options):
 
     def start():
         ps.get_datastore().start_profiling()
-        ps.get_state_handler().start_caching()
 
     reactor.callWhenRunning(start)
 
diff --git a/synapse/config/__main__.py b/synapse/config/__main__.py
index 58c97a70af..8fccf573ee 100644
--- a/synapse/config/__main__.py
+++ b/synapse/config/__main__.py
@@ -25,7 +25,7 @@ if __name__ == "__main__":
         try:
             config = HomeServerConfig.load_config("", sys.argv[3:])
         except ConfigError as e:
-            sys.stderr.write("\n" + e.message + "\n")
+            sys.stderr.write("\n" + str(e) + "\n")
             sys.exit(1)
 
         print (getattr(config, key))
diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py
index fe67b2ff42..d05ed91d64 100644
--- a/synapse/federation/federation_client.py
+++ b/synapse/federation/federation_client.py
@@ -66,6 +66,14 @@ class FederationClient(FederationBase):
         self.state = hs.get_state_handler()
         self.transport_layer = hs.get_federation_transport_client()
 
+        self._get_pdu_cache = ExpiringCache(
+            cache_name="get_pdu_cache",
+            clock=self._clock,
+            max_len=1000,
+            expiry_ms=120 * 1000,
+            reset_expiry_on_get=False,
+        )
+
     def _clear_tried_cache(self):
         """Clear pdu_destination_tried cache"""
         now = self._clock.time_msec()
@@ -82,17 +90,6 @@ class FederationClient(FederationBase):
             if destination_dict:
                 self.pdu_destination_tried[event_id] = destination_dict
 
-    def start_get_pdu_cache(self):
-        self._get_pdu_cache = ExpiringCache(
-            cache_name="get_pdu_cache",
-            clock=self._clock,
-            max_len=1000,
-            expiry_ms=120 * 1000,
-            reset_expiry_on_get=False,
-        )
-
-        self._get_pdu_cache.start()
-
     @log_function
     def make_query(self, destination, query_type, args,
                    retry_on_dns_fail=False, ignore_backoff=False):
@@ -212,8 +209,6 @@ class FederationClient(FederationBase):
         Will attempt to get the PDU from each destination in the list until
         one succeeds.
 
-        This will persist the PDU locally upon receipt.
-
         Args:
             destinations (list): Which home servers to query
             event_id (str): event to fetch
@@ -229,10 +224,9 @@ class FederationClient(FederationBase):
 
         # TODO: Rate limit the number of times we try and get the same event.
 
-        if self._get_pdu_cache:
-            ev = self._get_pdu_cache.get(event_id)
-            if ev:
-                defer.returnValue(ev)
+        ev = self._get_pdu_cache.get(event_id)
+        if ev:
+            defer.returnValue(ev)
 
         pdu_attempts = self.pdu_destination_tried.setdefault(event_id, {})
 
@@ -285,7 +279,7 @@ class FederationClient(FederationBase):
                 )
                 continue
 
-        if self._get_pdu_cache is not None and signed_pdu:
+        if signed_pdu:
             self._get_pdu_cache[event_id] = signed_pdu
 
         defer.returnValue(signed_pdu)
@@ -293,8 +287,7 @@ class FederationClient(FederationBase):
     @defer.inlineCallbacks
     @log_function
     def get_state_for_room(self, destination, room_id, event_id):
-        """Requests all of the `current` state PDUs for a given room from
-        a remote home server.
+        """Requests all of the room state at a given event from a remote home server.
 
         Args:
             destination (str): The remote homeserver to query for the state.
@@ -302,9 +295,10 @@ class FederationClient(FederationBase):
             event_id (str): The id of the event we want the state at.
 
         Returns:
-            Deferred: Results in a list of PDUs.
+            Deferred[Tuple[List[EventBase], List[EventBase]]]:
+                A list of events in the state, and a list of events in the auth chain
+                for the given event.
         """
-
         try:
             # First we try and ask for just the IDs, as that's far quicker if
             # we have most of the state and auth_chain already.
diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py
index dbee404ea7..819e8f7331 100644
--- a/synapse/federation/federation_server.py
+++ b/synapse/federation/federation_server.py
@@ -46,6 +46,7 @@ from synapse.replication.http.federation import (
 from synapse.types import get_domain_from_id
 from synapse.util.async_helpers import Linearizer, concurrently_execute
 from synapse.util.caches.response_cache import ResponseCache
+from synapse.util.logcontext import nested_logging_context
 from synapse.util.logutils import log_function
 
 # when processing incoming transactions, we try to handle multiple rooms in
@@ -187,21 +188,22 @@ class FederationServer(FederationBase):
 
             for pdu in pdus_by_room[room_id]:
                 event_id = pdu.event_id
-                try:
-                    yield self._handle_received_pdu(
-                        origin, pdu
-                    )
-                    pdu_results[event_id] = {}
-                except FederationError as e:
-                    logger.warn("Error handling PDU %s: %s", event_id, e)
-                    pdu_results[event_id] = {"error": str(e)}
-                except Exception as e:
-                    f = failure.Failure()
-                    pdu_results[event_id] = {"error": str(e)}
-                    logger.error(
-                        "Failed to handle PDU %s: %s",
-                        event_id, f.getTraceback().rstrip(),
-                    )
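+                # Handle each PDU in its own logging context, so that log
+                # lines emitted while processing it carry the event_id.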
+                with nested_logging_context(event_id):
+                    try:
+                        yield self._handle_received_pdu(
+                            origin, pdu
+                        )
+                        pdu_results[event_id] = {}
+                    except FederationError as e:
+                        logger.warn("Error handling PDU %s: %s", event_id, e)
+                        pdu_results[event_id] = {"error": str(e)}
+                    except Exception as e:
+                        f = failure.Failure()
+                        pdu_results[event_id] = {"error": str(e)}
+                        logger.error(
+                            "Failed to handle PDU %s: %s",
+                            event_id, f.getTraceback().rstrip(),
+                        )
 
         yield concurrently_execute(
             process_pdus_for_room, pdus_by_room.keys(),
@@ -618,7 +620,7 @@ class FederationServer(FederationBase):
             )
 
         yield self.handler.on_receive_pdu(
-            origin, pdu, get_missing=True, sent_to_us_directly=True,
+            origin, pdu, sent_to_us_directly=True,
         )
 
     def __str__(self):
diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py
index 578e9250fb..9dc46aa15f 100644
--- a/synapse/handlers/e2e_keys.py
+++ b/synapse/handlers/e2e_keys.py
@@ -341,7 +341,7 @@ class E2eKeysHandler(object):
 def _exception_to_failure(e):
     if isinstance(e, CodeMessageException):
         return {
-            "status": e.code, "message": e.message,
+            "status": e.code, "message": str(e),
         }
 
     if isinstance(e, NotRetryingDestination):
diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py
index f10b46414b..38bebbf598 100644
--- a/synapse/handlers/federation.py
+++ b/synapse/handlers/federation.py
@@ -69,6 +69,27 @@ from ._base import BaseHandler
 logger = logging.getLogger(__name__)
 
 
+def shortstr(iterable, maxitems=5):
+    """If iterable has maxitems or fewer, return the stringification of a list
+    containing those items.
+
+    Otherwise, return the stringification of a list with the first maxitems items,
+    followed by "...".
+
+    Args:
+        iterable (Iterable): iterable to truncate
+        maxitems (int): number of items to return before truncating
+
+    Returns:
+        unicode
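+
+    For example:
+        shortstr(range(3)) -> "[0, 1, 2]"
+        shortstr(range(9)) -> "[0, 1, 2, 3, 4, ...]"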
+    """
+
+    items = list(itertools.islice(iterable, maxitems + 1))
+    if len(items) <= maxitems:
+        return str(items)
+    return u"[" + u", ".join(repr(r) for r in items[:maxitems]) + u", ...]"
+
+
 class FederationHandler(BaseHandler):
     """Handles events that originated from federation.
         Responsible for:
@@ -114,9 +135,8 @@ class FederationHandler(BaseHandler):
         self._room_pdu_linearizer = Linearizer("fed_room_pdu")
 
     @defer.inlineCallbacks
-    @log_function
     def on_receive_pdu(
-            self, origin, pdu, get_missing=True, sent_to_us_directly=False,
+            self, origin, pdu, sent_to_us_directly=False,
     ):
         """ Process a PDU received via a federation /send/ transaction, or
         via backfill of missing prev_events
@@ -125,14 +145,23 @@ class FederationHandler(BaseHandler):
             origin (str): server which initiated the /send/ transaction. Will
                 be used to fetch missing events or state.
             pdu (FrozenEvent): received PDU
-            get_missing (bool): True if we should fetch missing prev_events
+            sent_to_us_directly (bool): True if this event was pushed to us; False if
+                we pulled it as the result of a missing prev_event.
 
         Returns (Deferred): completes with None
         """
 
+        room_id = pdu.room_id
+        event_id = pdu.event_id
+
+        logger.info(
+            "[%s %s] handling received PDU: %s",
+            room_id, event_id, pdu,
+        )
+
         # We reprocess pdus when we have seen them only as outliers
         existing = yield self.store.get_event(
-            pdu.event_id,
+            event_id,
             allow_none=True,
             allow_rejected=True,
         )
@@ -147,7 +176,7 @@ class FederationHandler(BaseHandler):
             )
         )
         if already_seen:
-            logger.debug("Already seen pdu %s", pdu.event_id)
+            logger.debug("[%s %s]: Already seen pdu", room_id, event_id)
             return
 
         # do some initial sanity-checking of the event. In particular, make
@@ -156,6 +185,7 @@ class FederationHandler(BaseHandler):
         try:
             self._sanity_check_event(pdu)
         except SynapseError as err:
+            logger.warn("[%s %s] Received event failed sanity checks", room_id, event_id)
             raise FederationError(
                 "ERROR",
                 err.code,
@@ -165,10 +195,12 @@ class FederationHandler(BaseHandler):
 
         # If we are currently in the process of joining this room, then we
         # queue up events for later processing.
-        if pdu.room_id in self.room_queues:
-            logger.info("Ignoring PDU %s for room %s from %s for now; join "
-                        "in progress", pdu.event_id, pdu.room_id, origin)
-            self.room_queues[pdu.room_id].append((pdu, origin))
+        if room_id in self.room_queues:
+            logger.info(
+                "[%s %s] Queuing PDU from %s for now: join in progress",
+                room_id, event_id, origin,
+            )
+            self.room_queues[room_id].append((pdu, origin))
             return
 
         # If we're no longer in the room just ditch the event entirely. This
@@ -179,7 +211,7 @@ class FederationHandler(BaseHandler):
         # we should check if we *are* in fact in the room. If we are then we
         # can magically rejoin the room.
         is_in_room = yield self.auth.check_host_in_room(
-            pdu.room_id,
+            room_id,
             self.server_name
         )
         if not is_in_room:
@@ -188,8 +220,8 @@ class FederationHandler(BaseHandler):
             )
             if was_in_room:
                 logger.info(
-                    "Ignoring PDU %s for room %s from %s as we've left the room!",
-                    pdu.event_id, pdu.room_id, origin,
+                    "[%s %s] Ignoring PDU from %s as we've left the room",
+                    room_id, event_id, origin,
                 )
                 defer.returnValue(None)
 
@@ -204,8 +236,8 @@ class FederationHandler(BaseHandler):
             )
 
             logger.debug(
-                "_handle_new_pdu min_depth for %s: %d",
-                pdu.room_id, min_depth
+                "[%s %s] min_depth: %d",
+                room_id, event_id, min_depth,
             )
 
             prevs = {e_id for e_id, _ in pdu.prev_events}
@@ -218,17 +250,18 @@ class FederationHandler(BaseHandler):
                 # send to the clients.
                 pdu.internal_metadata.outlier = True
             elif min_depth and pdu.depth > min_depth:
-                if get_missing and prevs - seen:
+                missing_prevs = prevs - seen
+                if sent_to_us_directly and missing_prevs:
                     # If we're missing stuff, ensure we only fetch stuff one
                     # at a time.
                     logger.info(
-                        "Acquiring lock for room %r to fetch %d missing events: %r...",
-                        pdu.room_id, len(prevs - seen), list(prevs - seen)[:5],
+                        "[%s %s] Acquiring room lock to fetch %d missing prev_events: %s",
+                        room_id, event_id, len(missing_prevs), shortstr(missing_prevs),
                     )
                     with (yield self._room_pdu_linearizer.queue(pdu.room_id)):
                         logger.info(
-                            "Acquired lock for room %r to fetch %d missing events",
-                            pdu.room_id, len(prevs - seen),
+                            "[%s %s] Acquired room lock to fetch %d missing prev_events",
+                            room_id, event_id, len(missing_prevs),
                         )
 
                         yield self._get_missing_events_for_pdu(
@@ -241,49 +274,91 @@ class FederationHandler(BaseHandler):
 
                         if not prevs - seen:
                             logger.info(
-                                "Found all missing prev events for %s", pdu.event_id
+                                "[%s %s] Found all missing prev_events",
+                                room_id, event_id,
                             )
-                elif prevs - seen:
+                elif missing_prevs:
                     logger.info(
-                        "Not fetching %d missing events for room %r,event %s: %r...",
-                        len(prevs - seen), pdu.room_id, pdu.event_id,
-                        list(prevs - seen)[:5],
+                        "[%s %s] Not recursively fetching %d missing prev_events: %s",
+                        room_id, event_id, len(missing_prevs), shortstr(missing_prevs),
+                    )
+
+            if prevs - seen:
+                # We've still not been able to get all of the prev_events for this event.
+                #
+                # In this case, we need to fall back to asking another server in the
+                # federation for the state at this event. That's ok provided we then
+                # resolve the state against other bits of the DAG before using it (which
+                # will ensure that you can't just take over a room by sending an event,
+                # withholding its prev_events, and declaring yourself to be an admin in
+                # the subsequent state request).
+                #
+                # Now, if we're pulling this event as a missing prev_event, then clearly
+                # this event is not going to become the only forward-extremity and we are
+                # guaranteed to resolve its state against our existing forward
+                # extremities, so that should be fine.
+                #
+                # On the other hand, if this event was pushed to us, it is possible for
+                # it to become the only forward-extremity in the room, and we would then
+                # trust its state to be the state for the whole room. This is very bad.
+                # Further, if the event was pushed to us, there is no excuse for us not to
+                # have all the prev_events. We therefore reject any such events.
+                #
+                # XXX this really feels like it could/should be merged with the above,
+                # but there is an interaction with min_depth that I'm not really
+                # following.
+
+                if sent_to_us_directly:
+                    logger.warn(
+                        "[%s %s] Failed to fetch %d prev events: rejecting",
+                        room_id, event_id, len(prevs - seen),
+                    )
+                    raise FederationError(
+                        "ERROR",
+                        403,
+                        (
+                            "Your server isn't divulging details about prev_events "
+                            "referenced in this event."
+                        ),
+                        affected=pdu.event_id,
                     )
 
-            if sent_to_us_directly and prevs - seen:
-                # If they have sent it to us directly, and the server
-                # isn't telling us about the auth events that it's
-                # made a message referencing, we explode
-                raise FederationError(
-                    "ERROR",
-                    403,
-                    (
-                        "Your server isn't divulging details about prev_events "
-                        "referenced in this event."
-                    ),
-                    affected=pdu.event_id,
-                )
-            elif prevs - seen:
                 # Calculate the state of the previous events, and
                 # de-conflict them to find the current state.
                 state_groups = []
                 auth_chains = set()
                 try:
                     # Get the state of the events we know about
-                    ours = yield self.store.get_state_groups(pdu.room_id, list(seen))
+                    ours = yield self.store.get_state_groups(room_id, list(seen))
                     state_groups.append(ours)
 
                     # Ask the remote server for the states we don't
                     # know about
                     for p in prevs - seen:
-                        state, got_auth_chain = (
-                            yield self.federation_client.get_state_for_room(
-                                origin, pdu.room_id, p
-                            )
+                        logger.info(
+                            "[%s %s] Requesting state at missing prev_event %s",
+                            room_id, event_id, p,
                         )
-                        auth_chains.update(got_auth_chain)
-                        state_group = {(x.type, x.state_key): x.event_id for x in state}
-                        state_groups.append(state_group)
+
+                        with logcontext.nested_logging_context(p):
+                            # note that if any of the missing prevs share missing state or
+                            # auth events, the requests to fetch those events are deduped
+                            # by the get_pdu_cache in federation_client.
+                            remote_state, got_auth_chain = (
+                                yield self.federation_client.get_state_for_room(
+                                    origin, room_id, p,
+                                )
+                            )
+
+                            # XXX hrm I'm not convinced that duplicate events will compare
+                            # for equality, so I'm not sure this does what the author
+                            # hoped.
+                            auth_chains.update(got_auth_chain)
+
+                            state_group = {
+                                (x.type, x.state_key): x.event_id for x in remote_state
+                            }
+                            state_groups.append(state_group)
 
                     # Resolve any conflicting state
                     def fetch(ev_ids):
@@ -291,19 +366,24 @@ class FederationHandler(BaseHandler):
                             ev_ids, get_prev_content=False, check_redacted=False
                         )
 
-                    room_version = yield self.store.get_room_version(pdu.room_id)
+                    room_version = yield self.store.get_room_version(room_id)
                     state_map = yield resolve_events_with_factory(
-                        room_version, state_groups, {pdu.event_id: pdu}, fetch
+                        room_version, state_groups, {event_id: pdu}, fetch
                     )
 
                     state = (yield self.store.get_events(state_map.values())).values()
                     auth_chain = list(auth_chains)
                 except Exception:
+                    logger.warn(
+                        "[%s %s] Error attempting to resolve state at missing "
+                        "prev_events",
+                        room_id, event_id, exc_info=True,
+                    )
                     raise FederationError(
                         "ERROR",
                         403,
                         "We can't get valid state history.",
-                        affected=pdu.event_id,
+                        affected=event_id,
                     )
 
         yield self._process_received_pdu(
@@ -322,15 +402,16 @@ class FederationHandler(BaseHandler):
             prevs (set(str)): List of event ids which we are missing
             min_depth (int): Minimum depth of events to return.
         """
-        # We recalculate seen, since it may have changed.
+
+        room_id = pdu.room_id
+        event_id = pdu.event_id
+
         seen = yield self.store.have_seen_events(prevs)
 
         if not prevs - seen:
             return
 
-        latest = yield self.store.get_latest_event_ids_in_room(
-            pdu.room_id
-        )
+        latest = yield self.store.get_latest_event_ids_in_room(room_id)
 
         # We add the prev events that we have seen to the latest
         # list to ensure the remote server doesn't give them to us
@@ -338,8 +419,8 @@ class FederationHandler(BaseHandler):
         latest |= seen
 
         logger.info(
-            "Missing %d events for room %r pdu %s: %r...",
-            len(prevs - seen), pdu.room_id, pdu.event_id, list(prevs - seen)[:5]
+            "[%s %s]: Requesting %d prev_events: %s",
+            room_id, event_id, len(prevs - seen), shortstr(prevs - seen)
         )
 
         # XXX: we set timeout to 10s to help workaround
@@ -392,7 +473,7 @@ class FederationHandler(BaseHandler):
 
         missing_events = yield self.federation_client.get_missing_events(
             origin,
-            pdu.room_id,
+            room_id,
             earliest_events_ids=list(latest),
             latest_events=[pdu],
             limit=10,
@@ -401,37 +482,47 @@ class FederationHandler(BaseHandler):
         )
 
         logger.info(
-            "Got %d events: %r...",
-            len(missing_events), [e.event_id for e in missing_events[:5]]
+            "[%s %s]: Got %d prev_events: %s",
+            room_id, event_id, len(missing_events), shortstr(missing_events),
         )
 
         # We want to sort these by depth so we process them and
         # tell clients about them in order.
         missing_events.sort(key=lambda x: x.depth)
 
-        for e in missing_events:
-            logger.info("Handling found event %s", e.event_id)
-            try:
-                yield self.on_receive_pdu(
-                    origin,
-                    e,
-                    get_missing=False
-                )
-            except FederationError as e:
-                if e.code == 403:
-                    logger.warn("Event %s failed history check.")
-                else:
-                    raise
+        for ev in missing_events:
+            logger.info(
+                "[%s %s] Handling received prev_event %s",
+                room_id, event_id, ev.event_id,
+            )
+            with logcontext.nested_logging_context(ev.event_id):
+                try:
+                    yield self.on_receive_pdu(
+                        origin,
+                        ev,
+                        sent_to_us_directly=False,
+                    )
+                except FederationError as e:
+                    if e.code == 403:
+                        logger.warn(
+                            "[%s %s] Received prev_event %s failed history check.",
+                            room_id, event_id, ev.event_id,
+                        )
+                    else:
+                        raise
 
-    @log_function
     @defer.inlineCallbacks
-    def _process_received_pdu(self, origin, pdu, state, auth_chain):
+    def _process_received_pdu(self, origin, event, state, auth_chain):
         """ Called when we have a new pdu. We need to do auth checks and put it
         through the StateHandler.
         """
-        event = pdu
+        room_id = event.room_id
+        event_id = event.event_id
 
-        logger.debug("Processing event: %s", event)
+        logger.debug(
+            "[%s %s] Processing event: %s",
+            room_id, event_id, event,
+        )
 
         # FIXME (erikj): Awful hack to make the case where we are not currently
         # in the room work
@@ -440,15 +531,16 @@ class FederationHandler(BaseHandler):
         # event.
         if state and auth_chain and not event.internal_metadata.is_outlier():
             is_in_room = yield self.auth.check_host_in_room(
-                event.room_id,
+                room_id,
                 self.server_name
             )
         else:
             is_in_room = True
+
         if not is_in_room:
             logger.info(
-                "Got event for room we're not in: %r %r",
-                event.room_id, event.event_id
+                "[%s %s] Got event for room we're not in",
+                room_id, event_id,
             )
 
             try:
@@ -460,7 +552,7 @@ class FederationHandler(BaseHandler):
                     "ERROR",
                     e.code,
                     e.msg,
-                    affected=event.event_id,
+                    affected=event_id,
                 )
 
         else:
@@ -493,6 +585,10 @@ class FederationHandler(BaseHandler):
                     })
                     seen_ids.add(e.event_id)
 
+                logger.info(
+                    "[%s %s] persisting newly-received auth/state events %s",
+                    room_id, event_id, [e["event"].event_id for e in event_infos]
+                )
                 yield self._handle_new_events(origin, event_infos)
 
             try:
@@ -509,12 +605,12 @@ class FederationHandler(BaseHandler):
                     affected=event.event_id,
                 )
 
-        room = yield self.store.get_room(event.room_id)
+        room = yield self.store.get_room(room_id)
 
         if not room:
             try:
                 yield self.store.store_room(
-                    room_id=event.room_id,
+                    room_id=room_id,
                     room_creator_user_id="",
                     is_public=False,
                 )
@@ -542,7 +638,7 @@ class FederationHandler(BaseHandler):
 
                 if newly_joined:
                     user = UserID.from_string(event.state_key)
-                    yield self.user_joined_room(user, event.room_id)
+                    yield self.user_joined_room(user, room_id)
 
     @log_function
     @defer.inlineCallbacks
@@ -1056,7 +1152,8 @@ class FederationHandler(BaseHandler):
             try:
                 logger.info("Processing queued PDU %s which was received "
                             "while we were joining %s", p.event_id, p.room_id)
-                yield self.on_receive_pdu(origin, p)
+                with logcontext.nested_logging_context(p.event_id):
+                    yield self.on_receive_pdu(origin, p, sent_to_us_directly=True)
             except Exception as e:
                 logger.warn(
                     "Error handling queued PDU %s from %s: %s",
@@ -1459,12 +1556,10 @@ class FederationHandler(BaseHandler):
         else:
             defer.returnValue(None)
 
-    @log_function
     def get_min_depth_for_context(self, context):
         return self.store.get_min_depth(context)
 
     @defer.inlineCallbacks
-    @log_function
     def _handle_new_event(self, origin, event, state=None, auth_events=None,
                           backfilled=False):
         context = yield self._prep_event(
@@ -1504,15 +1599,22 @@ class FederationHandler(BaseHandler):
 
         Notifies about the events where appropriate.
         """
-        contexts = yield logcontext.make_deferred_yieldable(defer.gatherResults(
-            [
-                logcontext.run_in_background(
-                    self._prep_event,
+
+        @defer.inlineCallbacks
+        def prep(ev_info):
+            event = ev_info["event"]
+            with logcontext.nested_logging_context(suffix=event.event_id):
+                res = yield self._prep_event(
                     origin,
-                    ev_info["event"],
+                    event,
                     state=ev_info.get("state"),
                     auth_events=ev_info.get("auth_events"),
                 )
+            defer.returnValue(res)
+
+        contexts = yield logcontext.make_deferred_yieldable(defer.gatherResults(
+            [
+                logcontext.run_in_background(prep, ev_info)
                 for ev_info in event_infos
             ], consumeErrors=True,
         ))
@@ -1664,8 +1766,8 @@ class FederationHandler(BaseHandler):
             )
         except AuthError as e:
             logger.warn(
-                "Rejecting %s because %s",
-                event.event_id, e.msg
+                "[%s %s] Rejecting: %s",
+                event.room_id, event.event_id, e.msg
             )
 
             context.rejected = RejectedReason.AUTH_ERROR
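
The per-event nested logging contexts used above keep each log line attributable while a batch of fetched prev_events is processed. A minimal standalone sketch of the pattern (illustrative only; Synapse's real logcontexts also account CPU/db usage back to the parent):

    import contextlib
    import logging

    logging.basicConfig(format="%(request)s %(message)s", level=logging.INFO)
    logger = logging.getLogger(__name__)

    _ctx = {"request": "incoming-pdu"}

    @contextlib.contextmanager
    def nested(suffix):
        # child "request" = parent's request plus a suffix; restored on exit
        parent = _ctx["request"]
        _ctx["request"] = parent + "-" + suffix
        try:
            yield
        finally:
            _ctx["request"] = parent

    def log(msg):
        logger.info(msg, extra={"request": _ctx["request"]})

    for event_id in ("$ev1:hs", "$ev2:hs"):
        with nested(event_id):
            log("handling received prev_event")
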
diff --git a/synapse/handlers/profile.py b/synapse/handlers/profile.py
index 75b8b7ce6a..f284d5a385 100644
--- a/synapse/handlers/profile.py
+++ b/synapse/handlers/profile.py
@@ -278,7 +278,7 @@ class BaseProfileHandler(BaseHandler):
             except Exception as e:
                 logger.warn(
                     "Failed to update join event for room %s - %s",
-                    room_id, str(e.message)
+                    room_id, str(e)
                 )
 
 
diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py
index 9bca4e7067..c7d69d9d80 100644
--- a/synapse/handlers/sync.py
+++ b/synapse/handlers/sync.py
@@ -722,6 +722,13 @@ class SyncHandler(object):
             }
 
             if full_state:
+                if lazy_load_members:
+                    # always make sure we LL ourselves so we know we're in the room
+                    # (if we are) to fix https://github.com/vector-im/riot-web/issues/7209
+                    # We only need to apply this on full state syncs given we
+                    # disabled LL for incremental syncs in #3840.
+                    types.append((EventTypes.Member, sync_config.user.to_string()))
+
                 if batch:
                     current_state_ids = yield self.store.get_state_ids_for_event(
                         batch.events[-1].event_id, types=types,
@@ -790,7 +797,7 @@ class SyncHandler(object):
             else:
                 state_ids = {}
                 if lazy_load_members:
-                    if types:
+                    if types and batch.events:
                         # We're returning an incremental sync, with no
                         # "gap" since the previous sync, so normally there would be
                         # no state to return.
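
The sync.py change above makes a full-state lazy-loading sync always include the syncing user's own m.room.member event in the state filter, so clients can tell whether they are in the room. Roughly (a hypothetical helper, not Synapse's API):

    MEMBER = "m.room.member"

    def full_state_filter(types, lazy_load_members, user_id):
        # always include our own membership when lazy-loading members
        types = list(types)
        if lazy_load_members:
            types.append((MEMBER, user_id))
        return types

    assert (MEMBER, "@alice:hs") in full_state_filter(
        [("m.room.name", "")], True, "@alice:hs"
    )
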
diff --git a/synapse/http/server.py b/synapse/http/server.py
index 2d5c23e673..b4b25cab19 100644
--- a/synapse/http/server.py
+++ b/synapse/http/server.py
@@ -84,10 +84,21 @@ def wrap_json_request_handler(h):
             logger.info(
                 "%s SynapseError: %s - %s", request, code, e.msg
             )
-            respond_with_json(
-                request, code, e.error_dict(), send_cors=True,
-                pretty_print=_request_user_agent_is_curl(request),
-            )
+
+            # Only respond with an error response if we haven't already started
+            # writing, otherwise let's just kill the connection
+            if request.startedWriting:
+                if request.transport:
+                    try:
+                        request.transport.abortConnection()
+                    except Exception:
+                        # abortConnection throws if the connection is already closed
+                        pass
+            else:
+                respond_with_json(
+                    request, code, e.error_dict(), send_cors=True,
+                    pretty_print=_request_user_agent_is_curl(request),
+                )
 
         except Exception:
             # failure.Failure() fishes the original Failure out
@@ -100,16 +111,26 @@ def wrap_json_request_handler(h):
                 request,
                 f.getTraceback().rstrip(),
             )
-            respond_with_json(
-                request,
-                500,
-                {
-                    "error": "Internal server error",
-                    "errcode": Codes.UNKNOWN,
-                },
-                send_cors=True,
-                pretty_print=_request_user_agent_is_curl(request),
-            )
+            # Only respond with an error response if we haven't already started
+            # writing, otherwise let's just kill the connection
+            if request.startedWriting:
+                if request.transport:
+                    try:
+                        request.transport.abortConnection()
+                    except Exception:
+                        # abortConnection throws if the connection is already closed
+                        pass
+            else:
+                respond_with_json(
+                    request,
+                    500,
+                    {
+                        "error": "Internal server error",
+                        "errcode": Codes.UNKNOWN,
+                    },
+                    send_cors=True,
+                    pretty_print=_request_user_agent_is_curl(request),
+                )
 
     return wrap_async_request_handler(wrapped_request_handler)
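
The guard above now appears twice in server.py; it could be factored into a helper along these lines (a sketch with hypothetical names, not Synapse's actual code):

    def respond_or_abort(request, send_error_response):
        """Send an error response, unless the response has already started,
        in which case the only safe option is to drop the connection."""
        if not request.startedWriting:
            send_error_response()
            return
        if request.transport:
            try:
                request.transport.abortConnection()
            except Exception:
                # abortConnection throws if the connection is already closed
                pass
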
 
diff --git a/synapse/http/site.py b/synapse/http/site.py
index 9579e8cd0d..50be2de3bb 100644
--- a/synapse/http/site.py
+++ b/synapse/http/site.py
@@ -75,9 +75,9 @@ class SynapseRequest(Request):
         return '<%s at 0x%x method=%r uri=%r clientproto=%r site=%r>' % (
             self.__class__.__name__,
             id(self),
-            self.method,
+            self.method.decode('ascii', errors='replace'),
             self.get_redacted_uri(),
-            self.clientproto,
+            self.clientproto.decode('ascii', errors='replace'),
             self.site.site_tag,
         )
 
@@ -308,7 +308,7 @@ class XForwardedForRequest(SynapseRequest):
             C{b"-"}.
         """
         return self.requestHeaders.getRawHeaders(
-            b"x-forwarded-for", [b"-"])[0].split(b",")[0].strip()
+            b"x-forwarded-for", [b"-"])[0].split(b",")[0].strip().decode('ascii')
 
 
 class SynapseRequestFactory(object):
diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py
index 167167be0a..173908299c 100644
--- a/synapse/metrics/background_process_metrics.py
+++ b/synapse/metrics/background_process_metrics.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import threading
 
 import six
@@ -23,6 +24,9 @@ from twisted.internet import defer
 
 from synapse.util.logcontext import LoggingContext, PreserveLoggingContext
 
+logger = logging.getLogger(__name__)
+
+
 _background_process_start_count = Counter(
     "synapse_background_process_start_count",
     "Number of background processes started",
@@ -191,6 +195,8 @@ def run_as_background_process(desc, func, *args, **kwargs):
 
             try:
                 yield func(*args, **kwargs)
+            except Exception:
+                logger.exception("Background process '%s' threw an exception", desc)
             finally:
                 proc.update_metrics()
 
diff --git a/synapse/notifier.py b/synapse/notifier.py
index f1d92c1395..340b16ce25 100644
--- a/synapse/notifier.py
+++ b/synapse/notifier.py
@@ -24,9 +24,10 @@ from synapse.api.constants import EventTypes, Membership
 from synapse.api.errors import AuthError
 from synapse.handlers.presence import format_user_presence_state
 from synapse.metrics import LaterGauge
+from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.types import StreamToken
 from synapse.util.async_helpers import ObservableDeferred, timeout_deferred
-from synapse.util.logcontext import PreserveLoggingContext, run_in_background
+from synapse.util.logcontext import PreserveLoggingContext
 from synapse.util.logutils import log_function
 from synapse.util.metrics import Measure
 from synapse.visibility import filter_events_for_client
@@ -248,7 +249,10 @@ class Notifier(object):
     def _on_new_room_event(self, event, room_stream_id, extra_users=[]):
         """Notify any user streams that are interested in this room event"""
         # poke any interested application service.
-        run_in_background(self._notify_app_services, room_stream_id)
+        run_as_background_process(
+            "notify_app_services",
+            self._notify_app_services, room_stream_id,
+        )
 
         if self.federation_sender:
             self.federation_sender.notify_new_events(room_stream_id)
diff --git a/synapse/push/mailer.py b/synapse/push/mailer.py
index b78ce10396..1a5a10d974 100644
--- a/synapse/push/mailer.py
+++ b/synapse/push/mailer.py
@@ -441,7 +441,7 @@ class Mailer(object):
 
     def make_room_link(self, room_id):
         if self.hs.config.email_riot_base_url:
-            base_url = self.hs.config.email_riot_base_url
+            base_url = "%s/#/room" % (self.hs.config.email_riot_base_url)
         elif self.app_name == "Vector":
             # need /beta for Universal Links to work on iOS
             base_url = "https://vector.im/beta/#/room"
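
With the fix above, a configured email_riot_base_url produces a working room link instead of one missing the '/#/room' fragment. The final link is base_url plus the room id, per the rest of make_room_link:

    def make_room_link(email_riot_base_url, room_id):
        base_url = "%s/#/room" % (email_riot_base_url,)
        return "%s/%s" % (base_url, room_id)

    assert (
        make_room_link("https://riot.example.com", "!room:hs")
        == "https://riot.example.com/#/room/!room:hs"
    )
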
diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py
index 0d8de600cf..0f339a0320 100644
--- a/synapse/python_dependencies.py
+++ b/synapse/python_dependencies.py
@@ -33,32 +33,35 @@ logger = logging.getLogger(__name__)
 # [2] https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-dependencies
 REQUIREMENTS = {
     "jsonschema>=2.5.1": ["jsonschema>=2.5.1"],
-    "frozendict>=0.4": ["frozendict"],
+    "frozendict>=1": ["frozendict"],
     "unpaddedbase64>=1.1.0": ["unpaddedbase64>=1.1.0"],
     "canonicaljson>=1.1.3": ["canonicaljson>=1.1.3"],
     "signedjson>=1.0.0": ["signedjson>=1.0.0"],
     "pynacl>=1.2.1": ["nacl>=1.2.1", "nacl.bindings"],
-    "service_identity>=1.0.0": ["service_identity>=1.0.0"],
+    "service_identity>=16.0.0": ["service_identity>=16.0.0"],
     "Twisted>=17.1.0": ["twisted>=17.1.0"],
     "treq>=15.1": ["treq>=15.1"],
 
     # Twisted has required pyopenssl 16.0 since about Twisted 16.6.
     "pyopenssl>=16.0.0": ["OpenSSL>=16.0.0"],
 
-    "pyyaml": ["yaml"],
-    "pyasn1": ["pyasn1"],
-    "daemonize": ["daemonize"],
-    "bcrypt": ["bcrypt>=3.1.0"],
-    "pillow": ["PIL"],
-    "pydenticon": ["pydenticon"],
-    "sortedcontainers": ["sortedcontainers"],
-    "pysaml2>=3.0.0": ["saml2>=3.0.0"],
-    "pymacaroons-pynacl": ["pymacaroons"],
+    "pyyaml>=3.11": ["yaml"],
+    "pyasn1>=0.1.9": ["pyasn1"],
+    "pyasn1-modules>=0.0.7": ["pyasn1_modules"],
+    "daemonize>=2.3.1": ["daemonize"],
+    "bcrypt>=3.1.0": ["bcrypt>=3.1.0"],
+    "pillow>=3.1.2": ["PIL"],
+    "pydenticon>=0.2": ["pydenticon"],
+    "sortedcontainers>=1.4.4": ["sortedcontainers"],
+    "pysaml2>=3.0.0": ["saml2"],
+    "pymacaroons-pynacl>=0.9.3": ["pymacaroons"],
     "msgpack-python>=0.3.0": ["msgpack"],
     "phonenumbers>=8.2.0": ["phonenumbers"],
-    "six": ["six"],
-    "prometheus_client": ["prometheus_client"],
-    "attrs": ["attr"],
+    "six>=1.10": ["six"],
+    "prometheus_client>=0.0.18": ["prometheus_client"],
+
+    # we use attr.s(slots), which arrived in 16.0.0
+    "attrs>=16.0.0": ["attr>=16.0.0"],
     "netaddr>=0.7.18": ["netaddr"],
 }
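
Each REQUIREMENTS key is a pip requirement string and each value lists the importable module names that prove it is installed. A rough sketch of how such a table can be checked at startup (version comparisons omitted):

    import importlib

    def missing_requirements(requirements):
        missing = []
        for requirement, modules in requirements.items():
            for module in modules:
                name = module.split(">=")[0]
                try:
                    importlib.import_module(name)
                except ImportError:
                    missing.append(requirement)
                    break
        return missing

    print(missing_requirements({"six>=1.10": ["six"]}))
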
 
diff --git a/synapse/rest/media/v1/download_resource.py b/synapse/rest/media/v1/download_resource.py
index ca90964d1d..f911b120b1 100644
--- a/synapse/rest/media/v1/download_resource.py
+++ b/synapse/rest/media/v1/download_resource.py
@@ -52,6 +52,7 @@ class DownloadResource(Resource):
             b" script-src 'none';"
             b" plugin-types application/pdf;"
             b" style-src 'unsafe-inline';"
+            b" media-src 'self';"
             b" object-src 'self';"
         )
         server_name, media_id, name = parse_media_id(request)
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index cad2dec33a..af01040a38 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -79,7 +79,6 @@ class PreviewUrlResource(Resource):
             # don't spider URLs more often than once an hour
             expiry_ms=60 * 60 * 1000,
         )
-        self._cache.start()
 
         self._cleaner_loop = self.clock.looping_call(
             self._start_expire_url_cache_data, 10 * 1000,
diff --git a/synapse/state/__init__.py b/synapse/state/__init__.py
index d7ae22a661..b22495c1f9 100644
--- a/synapse/state/__init__.py
+++ b/synapse/state/__init__.py
@@ -95,10 +95,6 @@ class StateHandler(object):
         self.hs = hs
         self._state_resolution_handler = hs.get_state_resolution_handler()
 
-    def start_caching(self):
-        # TODO: remove this shim
-        self._state_resolution_handler.start_caching()
-
     @defer.inlineCallbacks
     def get_current_state(self, room_id, event_type=None, state_key="",
                           latest_event_ids=None):
@@ -428,9 +424,6 @@ class StateResolutionHandler(object):
         self._state_cache = None
         self.resolve_linearizer = Linearizer(name="state_resolve_lock")
 
-    def start_caching(self):
-        logger.debug("start_caching")
-
         self._state_cache = ExpiringCache(
             cache_name="state_cache",
             clock=self.clock,
@@ -440,8 +433,6 @@ class StateResolutionHandler(object):
             reset_expiry_on_get=True,
         )
 
-        self._state_cache.start()
-
     @defer.inlineCallbacks
     @log_function
     def resolve_state_groups(
diff --git a/synapse/state/v1.py b/synapse/state/v1.py
index c95477d318..7a7157b352 100644
--- a/synapse/state/v1.py
+++ b/synapse/state/v1.py
@@ -65,10 +65,15 @@ def resolve_events_with_factory(state_sets, event_map, state_map_factory):
         for event_ids in itervalues(conflicted_state)
         for event_id in event_ids
     )
+    needed_event_count = len(needed_events)
     if event_map is not None:
         needed_events -= set(iterkeys(event_map))
 
-    logger.info("Asking for %d conflicted events", len(needed_events))
+    logger.info(
+        "Asking for %d/%d conflicted events",
+        len(needed_events),
+        needed_event_count,
+    )
 
     # dict[str, FrozenEvent]: a map from state event id to event. Only includes
     # the state events which are in conflict (and those in event_map)
@@ -85,11 +90,16 @@ def resolve_events_with_factory(state_sets, event_map, state_map_factory):
     )
 
     new_needed_events = set(itervalues(auth_events))
+    new_needed_event_count = len(new_needed_events)
     new_needed_events -= needed_events
     if event_map is not None:
         new_needed_events -= set(iterkeys(event_map))
 
-    logger.info("Asking for %d auth events", len(new_needed_events))
+    logger.info(
+        "Asking for %d/%d auth events",
+        len(new_needed_events),
+        new_needed_event_count,
+    )
 
     state_map_new = yield state_map_factory(new_needed_events)
     state_map.update(state_map_new)
diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py
index 8fc678fa67..9ad17b7c25 100644
--- a/synapse/storage/client_ips.py
+++ b/synapse/storage/client_ips.py
@@ -119,21 +119,25 @@ class ClientIpStore(background_updates.BackgroundUpdateStore):
         for entry in iteritems(to_update):
             (user_id, access_token, ip), (user_agent, device_id, last_seen) = entry
 
-            self._simple_upsert_txn(
-                txn,
-                table="user_ips",
-                keyvalues={
-                    "user_id": user_id,
-                    "access_token": access_token,
-                    "ip": ip,
-                    "user_agent": user_agent,
-                    "device_id": device_id,
-                },
-                values={
-                    "last_seen": last_seen,
-                },
-                lock=False,
-            )
+            try:
+                self._simple_upsert_txn(
+                    txn,
+                    table="user_ips",
+                    keyvalues={
+                        "user_id": user_id,
+                        "access_token": access_token,
+                        "ip": ip,
+                        "user_agent": user_agent,
+                        "device_id": device_id,
+                    },
+                    values={
+                        "last_seen": last_seen,
+                    },
+                    lock=False,
+                )
+            except Exception as e:
+                # Failed to upsert, log and continue
+                logger.error("Failed to insert client IP %r: %r", entry, e)
 
     @defer.inlineCallbacks
     def get_last_client_ip_by_device(self, user_id, device_id):
diff --git a/synapse/storage/monthly_active_users.py b/synapse/storage/monthly_active_users.py
index 59580949f1..0fe8c8e24c 100644
--- a/synapse/storage/monthly_active_users.py
+++ b/synapse/storage/monthly_active_users.py
@@ -172,6 +172,10 @@ class MonthlyActiveUsersStore(SQLBaseStore):
             Deferred[bool]: True if a new entry was created, False if an
                 existing one was updated.
         """
+        # Am consciously deciding to lock the table on the basis that it ought
+        # never be a big table and alternative approaches (batching multiple
+        # upserts into a single txn) introduced a lot of extra complexity.
+        # See https://github.com/matrix-org/synapse/issues/3854 for more details.
         is_insert = yield self._simple_upsert(
             desc="upsert_monthly_active_user",
             table="monthly_active_users",
@@ -181,7 +185,6 @@ class MonthlyActiveUsersStore(SQLBaseStore):
             values={
                 "timestamp": int(self._clock.time_msec()),
             },
-            lock=False,
         )
         if is_insert:
             self.user_last_seen_monthly_active.invalidate((user_id,))
diff --git a/synapse/storage/state.py b/synapse/storage/state.py
index 4b971efdba..3f4cbd61c4 100644
--- a/synapse/storage/state.py
+++ b/synapse/storage/state.py
@@ -255,7 +255,17 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
         )
 
     @defer.inlineCallbacks
-    def get_state_groups_ids(self, room_id, event_ids):
+    def get_state_groups_ids(self, _room_id, event_ids):
+        """Get the event IDs of all the state for the state groups for the given events
+
+        Args:
+            _room_id (str): id of the room for these events
+            event_ids (iterable[str]): ids of the events
+
+        Returns:
+            Deferred[dict[int, dict[tuple[str, str], str]]]:
+                dict of state_group_id -> (dict of (type, state_key) -> event id)
+        """
         if not event_ids:
             defer.returnValue({})
 
@@ -270,7 +280,7 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
 
     @defer.inlineCallbacks
     def get_state_ids_for_group(self, state_group):
-        """Get the state IDs for the given state group
+        """Get the event IDs of all the state in the given state group
 
         Args:
             state_group (int)
@@ -286,7 +296,9 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
     def get_state_groups(self, room_id, event_ids):
         """ Get the state groups for the given list of event_ids
 
-        The return value is a dict mapping group names to lists of events.
+        Returns:
+            Deferred[dict[int, list[EventBase]]]:
+                dict of state_group_id -> list of state events.
         """
         if not event_ids:
             defer.returnValue({})
@@ -324,7 +336,8 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
                 member events (if True), or to exclude member events (if False)
 
         Returns:
-            dictionary state_group -> (dict of (type, state_key) -> event id)
+            Deferred[dict[int, dict[tuple[str, str], str]]]:
+                dict of state_group_id -> (dict of (type, state_key) -> event id)
         """
         results = {}
 
@@ -732,8 +746,8 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
                 If None, `types` filtering is applied to all events.
 
         Returns:
-            Deferred[dict[int, dict[(type, state_key), EventBase]]]
-                a dictionary mapping from state group to state dictionary.
+            Deferred[dict[int, dict[tuple[str, str], str]]]:
+                dict of state_group_id -> (dict of (type, state_key) -> event id)
         """
         if types is not None:
             non_member_types = [t for t in types if t[0] != EventTypes.Member]
@@ -788,8 +802,8 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
                 If None, `types` filtering is applied to all events.
 
         Returns:
-            Deferred[dict[int, dict[(type, state_key), EventBase]]]
-                a dictionary mapping from state group to state dictionary.
+            Deferred[dict[int, dict[tuple[str, str], str]]]:
+                dict of state_group_id -> (dict of (type, state_key) -> event id)
         """
         if types:
             types = frozenset(types)
diff --git a/synapse/storage/transactions.py b/synapse/storage/transactions.py
index 0c42bd3322..baf0379a68 100644
--- a/synapse/storage/transactions.py
+++ b/synapse/storage/transactions.py
@@ -23,7 +23,6 @@ from canonicaljson import encode_canonical_json
 from twisted.internet import defer
 
 from synapse.metrics.background_process_metrics import run_as_background_process
-from synapse.util.caches.descriptors import cached
 
 from ._base import SQLBaseStore, db_to_json
 
@@ -156,7 +155,6 @@ class TransactionStore(SQLBaseStore):
         """
         pass
 
-    @cached(max_entries=10000)
     def get_destination_retry_timings(self, destination):
         """Gets the current retry timings (if any) for a given destination.
 
@@ -198,8 +196,6 @@ class TransactionStore(SQLBaseStore):
             retry_interval (int) - how long until next retry in ms
         """
 
-        # XXX: we could chose to not bother persisting this if our cache thinks
-        # this is a NOOP
         return self.runInteraction(
             "set_destination_retry_timings",
             self._set_destination_retry_timings,
@@ -212,10 +208,6 @@ class TransactionStore(SQLBaseStore):
                                        retry_last_ts, retry_interval):
         self.database_engine.lock_table(txn, "destinations")
 
-        self._invalidate_cache_and_stream(
-            txn, self.get_destination_retry_timings, (destination,)
-        )
-
         # We need to be careful here as the data may have changed from under us
         # due to a worker setting the timings.
 
diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py
index 7b065b195e..f37d5bec08 100644
--- a/synapse/util/caches/__init__.py
+++ b/synapse/util/caches/__init__.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import os
 
 import six
@@ -20,6 +21,8 @@ from six.moves import intern
 
 from prometheus_client.core import REGISTRY, Gauge, GaugeMetricFamily
 
+logger = logging.getLogger(__name__)
+
 CACHE_SIZE_FACTOR = float(os.environ.get("SYNAPSE_CACHE_FACTOR", 0.5))
 
 
@@ -76,16 +79,20 @@ def register_cache(cache_type, cache_name, cache):
             return []
 
         def collect(self):
-            if cache_type == "response_cache":
-                response_cache_size.labels(cache_name).set(len(cache))
-                response_cache_hits.labels(cache_name).set(self.hits)
-                response_cache_evicted.labels(cache_name).set(self.evicted_size)
-                response_cache_total.labels(cache_name).set(self.hits + self.misses)
-            else:
-                cache_size.labels(cache_name).set(len(cache))
-                cache_hits.labels(cache_name).set(self.hits)
-                cache_evicted.labels(cache_name).set(self.evicted_size)
-                cache_total.labels(cache_name).set(self.hits + self.misses)
+            try:
+                if cache_type == "response_cache":
+                    response_cache_size.labels(cache_name).set(len(cache))
+                    response_cache_hits.labels(cache_name).set(self.hits)
+                    response_cache_evicted.labels(cache_name).set(self.evicted_size)
+                    response_cache_total.labels(cache_name).set(self.hits + self.misses)
+                else:
+                    cache_size.labels(cache_name).set(len(cache))
+                    cache_hits.labels(cache_name).set(self.hits)
+                    cache_evicted.labels(cache_name).set(self.evicted_size)
+                    cache_total.labels(cache_name).set(self.hits + self.misses)
+            except Exception as e:
+                logger.warn("Error calculating metrics for %s: %s", cache_name, e)
+                raise
 
             yield GaugeMetricFamily("__unused", "")
 
diff --git a/synapse/util/caches/expiringcache.py b/synapse/util/caches/expiringcache.py
index ce85b2ae11..9af4ec4aa8 100644
--- a/synapse/util/caches/expiringcache.py
+++ b/synapse/util/caches/expiringcache.py
@@ -16,6 +16,8 @@
 import logging
 from collections import OrderedDict
 
+from six import itervalues
+
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.util.caches import register_cache
 
@@ -54,11 +56,8 @@ class ExpiringCache(object):
 
         self.iterable = iterable
 
-        self._size_estimate = 0
-
         self.metrics = register_cache("expiring", cache_name, self)
 
-    def start(self):
         if not self._expiry_ms:
             # Don't bother starting the loop if things never expire
             return
@@ -75,16 +74,11 @@ class ExpiringCache(object):
         now = self._clock.time_msec()
         self._cache[key] = _CacheEntry(now, value)
 
-        if self.iterable:
-            self._size_estimate += len(value)
-
         # Evict if there are now too many items
         while self._max_len and len(self) > self._max_len:
             _key, value = self._cache.popitem(last=False)
             if self.iterable:
-                removed_len = len(value.value)
-                self.metrics.inc_evictions(removed_len)
-                self._size_estimate -= removed_len
+                self.metrics.inc_evictions(len(value.value))
             else:
                 self.metrics.inc_evictions()
 
@@ -135,7 +129,9 @@ class ExpiringCache(object):
         for k in keys_to_delete:
             value = self._cache.pop(k)
             if self.iterable:
-                self._size_estimate -= len(value.value)
+                self.metrics.inc_evictions(len(value.value))
+            else:
+                self.metrics.inc_evictions()
 
         logger.debug(
             "[%s] _prune_cache before: %d, after len: %d",
@@ -144,7 +140,7 @@ class ExpiringCache(object):
 
     def __len__(self):
         if self.iterable:
-            return self._size_estimate
+            return sum(len(entry.value) for entry in itervalues(self._cache))
         else:
             return len(self._cache)
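
With this change an iterable ExpiringCache reports its size by summing its entries' lengths on demand, rather than maintaining a running estimate that the eviction and pruning paths could let drift. The idea in miniature:

    cache = {"group1": ["$a:hs", "$b:hs"], "group2": ["$c:hs"]}

    def iterable_len(cache):
        # count the contained items, not the top-level keys
        return sum(len(entry) for entry in cache.values())

    assert iterable_len(cache) == 3
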
 
diff --git a/synapse/util/logcontext.py b/synapse/util/logcontext.py
index a0c2d37610..89224b26cc 100644
--- a/synapse/util/logcontext.py
+++ b/synapse/util/logcontext.py
@@ -200,7 +200,7 @@ class LoggingContext(object):
 
     sentinel = Sentinel()
 
-    def __init__(self, name=None, parent_context=None):
+    def __init__(self, name=None, parent_context=None, request=None):
         self.previous_context = LoggingContext.current_context()
         self.name = name
 
@@ -218,6 +218,13 @@ class LoggingContext(object):
 
         self.parent_context = parent_context
 
+        if self.parent_context is not None:
+            self.parent_context.copy_to(self)
+
+        if request is not None:
+            # the request param overrides the request from the parent context
+            self.request = request
+
     def __str__(self):
         return "%s@%x" % (self.name, id(self))
 
@@ -256,9 +263,6 @@ class LoggingContext(object):
             )
         self.alive = True
 
-        if self.parent_context is not None:
-            self.parent_context.copy_to(self)
-
         return self
 
     def __exit__(self, type, value, traceback):
@@ -439,6 +443,35 @@ class PreserveLoggingContext(object):
                 )
 
 
+def nested_logging_context(suffix, parent_context=None):
+    """Creates a new logging context as a child of another.
+
+    The nested logging context will have a 'request' made up of the parent context's
+    request, plus the given suffix.
+
+    CPU/db usage stats will be added to the parent context's on exit.
+
+    Normal usage looks like:
+
+        with nested_logging_context(suffix):
+            # ... do stuff
+
+    Args:
+        suffix (str): suffix to add to the parent context's 'request'.
+        parent_context (LoggingContext|None): parent context. Will use the current context
+            if None.
+
+    Returns:
+        LoggingContext: new logging context.
+    """
+    if parent_context is None:
+        parent_context = LoggingContext.current_context()
+    return LoggingContext(
+        parent_context=parent_context,
+        request=parent_context.request + "-" + suffix,
+    )
+
+
 def preserve_fn(f):
     """Function decorator which wraps the function with run_in_background"""
     def g(*args, **kwargs):
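
A usage sketch for the new helper, assuming synapse.util.logcontext as patched above: the child context's 'request' becomes the parent's plus the suffix, and its CPU/db usage is folded back into the parent on exit.

    from synapse.util import logcontext

    with logcontext.LoggingContext(name="main", request="GET-123"):
        with logcontext.nested_logging_context("$ev:hs"):
            pass  # log lines here carry request "GET-123-$ev:hs"
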
diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py
index 8a3a06fd74..26cce7d197 100644
--- a/synapse/util/retryutils.py
+++ b/synapse/util/retryutils.py
@@ -188,7 +188,7 @@ class RetryDestinationLimiter(object):
             else:
                 self.retry_interval = self.min_retry_interval
 
-            logger.debug(
+            logger.info(
                 "Connection to %s was unsuccessful (%s(%s)); backoff now %i",
                 self.destination, exc_type, exc_val, self.retry_interval
             )
diff --git a/synctl b/synctl
index 1bdceda20a..356e5cb6a7 120000..100755
--- a/synctl
+++ b/synctl
@@ -1 +1,294 @@
-./synapse/app/synctl.py
\ No newline at end of file
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2014-2016 OpenMarket Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import collections
+import errno
+import glob
+import os
+import os.path
+import signal
+import subprocess
+import sys
+import time
+
+from six import iteritems
+
+import yaml
+
+SYNAPSE = [sys.executable, "-B", "-m", "synapse.app.homeserver"]
+
+GREEN = "\x1b[1;32m"
+YELLOW = "\x1b[1;33m"
+RED = "\x1b[1;31m"
+NORMAL = "\x1b[m"
+
+
+def pid_running(pid):
+    try:
+        os.kill(pid, 0)
+        return True
+    except OSError as err:
+        if err.errno == errno.EPERM:
+            return True
+        return False
+
+
+def write(message, colour=NORMAL, stream=sys.stdout):
+    if colour == NORMAL:
+        stream.write(message + "\n")
+    else:
+        stream.write(colour + message + NORMAL + "\n")
+
+
+def abort(message, colour=RED, stream=sys.stderr):
+    write(message, colour, stream)
+    sys.exit(1)
+
+
+def start(configfile):
+    write("Starting ...")
+    args = SYNAPSE
+    args.extend(["--daemonize", "-c", configfile])
+
+    try:
+        subprocess.check_call(args)
+        write("started synapse.app.homeserver(%r)" %
+              (configfile,), colour=GREEN)
+    except subprocess.CalledProcessError as e:
+        write(
+            "error starting (exit code: %d); see above for logs" % e.returncode,
+            colour=RED,
+        )
+
+
+def start_worker(app, configfile, worker_configfile):
+    args = [
+        "python", "-B",
+        "-m", app,
+        "-c", configfile,
+        "-c", worker_configfile
+    ]
+
+    try:
+        subprocess.check_call(args)
+        write("started %s(%r)" % (app, worker_configfile), colour=GREEN)
+    except subprocess.CalledProcessError as e:
+        write(
+            "error starting %s(%r) (exit code: %d); see above for logs" % (
+                app, worker_configfile, e.returncode,
+            ),
+            colour=RED,
+        )
+
+
+def stop(pidfile, app):
+    if os.path.exists(pidfile):
+        pid = int(open(pidfile).read())
+        try:
+            os.kill(pid, signal.SIGTERM)
+            write("stopped %s" % (app,), colour=GREEN)
+        except OSError as err:
+            if err.errno == errno.ESRCH:
+                write("%s not running" % (app,), colour=YELLOW)
+            elif err.errno == errno.EPERM:
+                abort("Cannot stop %s: Operation not permitted" % (app,))
+            else:
+                abort("Cannot stop %s: Unknown error" % (app,))
+
+
+Worker = collections.namedtuple("Worker", [
+    "app", "configfile", "pidfile", "cache_factor", "cache_factors",
+])
+
+
+def main():
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "action",
+        choices=["start", "stop", "restart"],
+        help="whether to start, stop or restart the synapse",
+    )
+    parser.add_argument(
+        "configfile",
+        nargs="?",
+        default="homeserver.yaml",
+        help="the homeserver config file, defaults to homeserver.yaml",
+    )
+    parser.add_argument(
+        "-w", "--worker",
+        metavar="WORKERCONFIG",
+        help="start or stop a single worker",
+    )
+    parser.add_argument(
+        "-a", "--all-processes",
+        metavar="WORKERCONFIGDIR",
+        help="start or stop all the workers in the given directory"
+             " and the main synapse process",
+    )
+
+    options = parser.parse_args()
+
+    if options.worker and options.all_processes:
+        write(
+            'Cannot use "--worker" with "--all-processes"',
+            stream=sys.stderr
+        )
+        sys.exit(1)
+
+    configfile = options.configfile
+
+    if not os.path.exists(configfile):
+        write(
+            "No config file found\n"
+            "To generate a config file, run '%s -c %s --generate-config"
+            " --server-name=<server name>'\n" % (
+                " ".join(SYNAPSE), options.configfile
+            ),
+            stream=sys.stderr,
+        )
+        sys.exit(1)
+
+    with open(configfile) as stream:
+        config = yaml.load(stream)
+
+    pidfile = config["pid_file"]
+    cache_factor = config.get("synctl_cache_factor")
+    start_stop_synapse = True
+
+    if cache_factor:
+        os.environ["SYNAPSE_CACHE_FACTOR"] = str(cache_factor)
+
+    cache_factors = config.get("synctl_cache_factors", {})
+    for cache_name, factor in iteritems(cache_factors):
+        os.environ["SYNAPSE_CACHE_FACTOR_" + cache_name.upper()] = str(factor)
+
+    worker_configfiles = []
+    if options.worker:
+        start_stop_synapse = False
+        worker_configfile = options.worker
+        if not os.path.exists(worker_configfile):
+            write(
+                "No worker config found at %r" % (worker_configfile,),
+                stream=sys.stderr,
+            )
+            sys.exit(1)
+        worker_configfiles.append(worker_configfile)
+
+    if options.all_processes:
+        # To start the main synapse with -a you need to add a worker file
+        # with worker_app == "synapse.app.homeserver"
+        start_stop_synapse = False
+        worker_configdir = options.all_processes
+        if not os.path.isdir(worker_configdir):
+            write(
+                "No worker config directory found at %r" % (worker_configdir,),
+                stream=sys.stderr,
+            )
+            sys.exit(1)
+        worker_configfiles.extend(sorted(glob.glob(
+            os.path.join(worker_configdir, "*.yaml")
+        )))
+
+    workers = []
+    for worker_configfile in worker_configfiles:
+        with open(worker_configfile) as stream:
+            worker_config = yaml.load(stream)
+        worker_app = worker_config["worker_app"]
+        if worker_app == "synapse.app.homeserver":
+            # We need to special case all of this to pick up options that may
+            # be set in the main config file or in this worker config file.
+            worker_pidfile = (
+                worker_config.get("pid_file")
+                or pidfile
+            )
+            worker_cache_factor = worker_config.get("synctl_cache_factor") or cache_factor
+            worker_cache_factors = (
+                worker_config.get("synctl_cache_factors")
+                or cache_factors
+            )
+            daemonize = worker_config.get("daemonize") or config.get("daemonize")
+            assert daemonize, "Main process must have daemonize set to true"
+
+            # The master process doesn't support using worker_* config.
+            for key in worker_config:
+                if key == "worker_app":  # But we allow worker_app
+                    continue
+                assert not key.startswith("worker_"), \
+                    "Main process cannot use worker_* config"
+        else:
+            worker_pidfile = worker_config["worker_pid_file"]
+            worker_daemonize = worker_config["worker_daemonize"]
+            assert worker_daemonize, "In config %r: expected '%s' to be True" % (
+                worker_configfile, "worker_daemonize")
+            worker_cache_factor = worker_config.get("synctl_cache_factor")
+            worker_cache_factors = worker_config.get("synctl_cache_factors", {})
+        workers.append(Worker(
+            worker_app, worker_configfile, worker_pidfile, worker_cache_factor,
+            worker_cache_factors,
+        ))
+
+    action = options.action
+
+    if action == "stop" or action == "restart":
+        for worker in workers:
+            stop(worker.pidfile, worker.app)
+
+        if start_stop_synapse:
+            stop(pidfile, "synapse.app.homeserver")
+
+    # Wait for synapse to actually shut down before starting it again
+    if action == "restart":
+        running_pids = []
+        if start_stop_synapse and os.path.exists(pidfile):
+            running_pids.append(int(open(pidfile).read()))
+        for worker in workers:
+            if os.path.exists(worker.pidfile):
+                running_pids.append(int(open(worker.pidfile).read()))
+        if len(running_pids) > 0:
+            write("Waiting for process to exit before restarting...")
+            for running_pid in running_pids:
+                while pid_running(running_pid):
+                    time.sleep(0.2)
+            write("All processes exited; now restarting...")
+
+    if action == "start" or action == "restart":
+        if start_stop_synapse:
+            # Check if synapse is already running
+            if os.path.exists(pidfile) and pid_running(int(open(pidfile).read())):
+                abort("synapse.app.homeserver already running")
+            start(configfile)
+
+        for worker in workers:
+            env = os.environ.copy()
+
+            if worker.cache_factor:
+                os.environ["SYNAPSE_CACHE_FACTOR"] = str(worker.cache_factor)
+
+            for cache_name, factor in iteritems(worker.cache_factors):
+                os.environ["SYNAPSE_CACHE_FACTOR_" + cache_name.upper()] = str(factor)
+
+            start_worker(worker.app, configfile, worker.configfile)
+
+            # Reset env back to the original
+            os.environ.clear()
+            os.environ.update(env)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_postgresql.sh b/test_postgresql.sh
new file mode 100755
index 0000000000..1ffcaabd31
--- /dev/null
+++ b/test_postgresql.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# This script builds the Docker image to run the PostgreSQL tests, and then runs
+# the tests.
+
+set -e
+
+# Build, and tag
+docker build docker/ -f docker/Dockerfile-pgtests -t synapsepgtests
+
+# Run, mounting the current directory into /src
+docker run --rm -it -v "$(pwd):/src" synapsepgtests
diff --git a/tests/http/test_fedclient.py b/tests/http/test_fedclient.py
index 66c09f63b6..f3cb1423f0 100644
--- a/tests/http/test_fedclient.py
+++ b/tests/http/test_fedclient.py
@@ -54,7 +54,7 @@ class FederationClientTests(HomeserverTestCase):
     def test_client_never_connect(self):
         """
         If the HTTP request is not connected and is timed out, it'll give a
-        ConnectingCancelledError.
+        ConnectingCancelledError or TimeoutError.
         """
         d = self.cl.get_json("testserv:8008", "foo/bar", timeout=10000)
 
@@ -76,7 +76,7 @@ class FederationClientTests(HomeserverTestCase):
         self.reactor.advance(10.5)
         f = self.failureResultOf(d)
 
-        self.assertIsInstance(f.value, ConnectingCancelledError)
+        self.assertIsInstance(f.value, (ConnectingCancelledError, TimeoutError))
 
     def test_client_connect_no_response(self):
         """
diff --git a/tests/server.py b/tests/server.py
index ccea3baa55..7bee58dff1 100644
--- a/tests/server.py
+++ b/tests/server.py
@@ -98,7 +98,7 @@ class FakeSite:
         return FakeLogger()
 
 
-def make_request(method, path, content=b"", access_token=None):
+def make_request(method, path, content=b"", access_token=None, request=SynapseRequest):
     """
     Make a web request using the given method and path, feed it the
     content, and return the Request and the Channel underneath.
@@ -120,14 +120,16 @@ def make_request(method, path, content=b"", access_token=None):
     site = FakeSite()
     channel = FakeChannel()
 
-    req = SynapseRequest(site, channel)
+    req = request(site, channel)
     req.process = lambda: b""
     req.content = BytesIO(content)
 
     if access_token:
         req.requestHeaders.addRawHeader(b"Authorization", b"Bearer " + access_token)
 
-    req.requestHeaders.addRawHeader(b"X-Forwarded-For", b"127.0.0.1")
+    if content:
+        req.requestHeaders.addRawHeader(b"Content-Type", b"application/json")
+
     req.requestReceived(method, path, b"1.1")
 
     return req, channel
diff --git a/tests/storage/test_client_ips.py b/tests/storage/test_client_ips.py
index c9b02a062b..2ffbb9f14f 100644
--- a/tests/storage/test_client_ips.py
+++ b/tests/storage/test_client_ips.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 # Copyright 2016 OpenMarket Ltd
+# Copyright 2018 New Vector Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,35 +13,45 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import hashlib
+import hmac
+import json
+
 from mock import Mock
 
 from twisted.internet import defer
 
-import tests.unittest
-import tests.utils
+from synapse.http.site import XForwardedForRequest
+from synapse.rest.client.v1 import admin, login
+
+from tests import unittest
 
 
-class ClientIpStoreTestCase(tests.unittest.TestCase):
-    def __init__(self, *args, **kwargs):
-        super(ClientIpStoreTestCase, self).__init__(*args, **kwargs)
-        self.store = None  # type: synapse.storage.DataStore
-        self.clock = None  # type: tests.utils.MockClock
+class ClientIpStoreTestCase(unittest.HomeserverTestCase):
+    def make_homeserver(self, reactor, clock):
+        hs = self.setup_test_homeserver()
+        return hs
 
-    @defer.inlineCallbacks
-    def setUp(self):
-        self.hs = yield tests.utils.setup_test_homeserver(self.addCleanup)
+    def prepare(self, hs, reactor, clock):
         self.store = self.hs.get_datastore()
-        self.clock = self.hs.get_clock()
 
-    @defer.inlineCallbacks
     def test_insert_new_client_ip(self):
-        self.clock.now = 12345678
+        self.reactor.advance(12345678)
+
         user_id = "@user:id"
-        yield self.store.insert_client_ip(
-            user_id, "access_token", "ip", "user_agent", "device_id"
+        self.get_success(
+            self.store.insert_client_ip(
+                user_id, "access_token", "ip", "user_agent", "device_id"
+            )
         )
 
-        result = yield self.store.get_last_client_ip_by_device(user_id, "device_id")
+        # Trigger the storage loop
+        self.reactor.advance(10)
+
+        result = self.get_success(
+            self.store.get_last_client_ip_by_device(user_id, "device_id")
+        )
 
         r = result[(user_id, "device_id")]
         self.assertDictContainsSubset(
@@ -55,18 +66,18 @@ class ClientIpStoreTestCase(tests.unittest.TestCase):
             r,
         )
 
-    @defer.inlineCallbacks
     def test_disabled_monthly_active_user(self):
         self.hs.config.limit_usage_by_mau = False
         self.hs.config.max_mau_value = 50
         user_id = "@user:server"
-        yield self.store.insert_client_ip(
-            user_id, "access_token", "ip", "user_agent", "device_id"
+        self.get_success(
+            self.store.insert_client_ip(
+                user_id, "access_token", "ip", "user_agent", "device_id"
+            )
         )
-        active = yield self.store.user_last_seen_monthly_active(user_id)
+        active = self.get_success(self.store.user_last_seen_monthly_active(user_id))
         self.assertFalse(active)
 
-    @defer.inlineCallbacks
     def test_adding_monthly_active_user_when_full(self):
         self.hs.config.limit_usage_by_mau = True
         self.hs.config.max_mau_value = 50
@@ -76,38 +87,159 @@ class ClientIpStoreTestCase(tests.unittest.TestCase):
         self.store.get_monthly_active_count = Mock(
             return_value=defer.succeed(lots_of_users)
         )
-        yield self.store.insert_client_ip(
-            user_id, "access_token", "ip", "user_agent", "device_id"
+        self.get_success(
+            self.store.insert_client_ip(
+                user_id, "access_token", "ip", "user_agent", "device_id"
+            )
         )
-        active = yield self.store.user_last_seen_monthly_active(user_id)
+        active = self.get_success(self.store.user_last_seen_monthly_active(user_id))
         self.assertFalse(active)
 
-    @defer.inlineCallbacks
     def test_adding_monthly_active_user_when_space(self):
         self.hs.config.limit_usage_by_mau = True
         self.hs.config.max_mau_value = 50
         user_id = "@user:server"
-        active = yield self.store.user_last_seen_monthly_active(user_id)
+        active = self.get_success(self.store.user_last_seen_monthly_active(user_id))
         self.assertFalse(active)
 
-        yield self.store.insert_client_ip(
-            user_id, "access_token", "ip", "user_agent", "device_id"
+        # Trigger the saving loop
+        self.reactor.advance(10)
+
+        self.get_success(
+            self.store.insert_client_ip(
+                user_id, "access_token", "ip", "user_agent", "device_id"
+            )
         )
-        active = yield self.store.user_last_seen_monthly_active(user_id)
+        active = self.get_success(self.store.user_last_seen_monthly_active(user_id))
         self.assertTrue(active)
 
-    @defer.inlineCallbacks
     def test_updating_monthly_active_user_when_space(self):
         self.hs.config.limit_usage_by_mau = True
         self.hs.config.max_mau_value = 50
         user_id = "@user:server"
-        yield self.store.register(user_id=user_id, token="123", password_hash=None)
+        self.get_success(
+            self.store.register(user_id=user_id, token="123", password_hash=None)
+        )
 
-        active = yield self.store.user_last_seen_monthly_active(user_id)
+        active = self.get_success(self.store.user_last_seen_monthly_active(user_id))
         self.assertFalse(active)
 
-        yield self.store.insert_client_ip(
-            user_id, "access_token", "ip", "user_agent", "device_id"
+        # Trigger the saving loop
+        self.reactor.advance(10)
+
+        self.get_success(
+            self.store.insert_client_ip(
+                user_id, "access_token", "ip", "user_agent", "device_id"
+            )
         )
-        active = yield self.store.user_last_seen_monthly_active(user_id)
+        active = self.get_success(self.store.user_last_seen_monthly_active(user_id))
         self.assertTrue(active)
+
+
+class ClientIpAuthTestCase(unittest.HomeserverTestCase):
+
+    servlets = [admin.register_servlets, login.register_servlets]
+
+    def make_homeserver(self, reactor, clock):
+        hs = self.setup_test_homeserver()
+        return hs
+
+    def prepare(self, reactor, clock, hs):
+        self.hs.config.registration_shared_secret = u"shared"
+        self.store = self.hs.get_datastore()
+
+        # Create the user
+        request, channel = self.make_request("GET", "/_matrix/client/r0/admin/register")
+        self.render(request)
+        nonce = channel.json_body["nonce"]
+
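+        # The registration MAC is an HMAC-SHA1, keyed with the shared secret,
+        # over nonce \x00 username \x00 password \x00 admin-flag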
+        want_mac = hmac.new(key=b"shared", digestmod=hashlib.sha1)
+        want_mac.update(nonce.encode('ascii') + b"\x00bob\x00abc123\x00admin")
+        want_mac = want_mac.hexdigest()
+
+        body = json.dumps(
+            {
+                "nonce": nonce,
+                "username": "bob",
+                "password": "abc123",
+                "admin": True,
+                "mac": want_mac,
+            }
+        )
+        request, channel = self.make_request(
+            "POST", "/_matrix/client/r0/admin/register", body.encode('utf8')
+        )
+        self.render(request)
+
+        self.assertEqual(channel.code, 200)
+        self.user_id = channel.json_body["user_id"]
+
+    def test_request_with_xforwarded(self):
+        """
+        The IP in X-Forwarded-For is entered into the client IPs table.
+        """
+        self._runtest(
+            {b"X-Forwarded-For": b"127.9.0.1"},
+            "127.9.0.1",
+            {"request": XForwardedForRequest},
+        )
+
+    def test_request_from_getPeer(self):
+        """
+        The IP returned by getPeer is entered into the client IPs table, if
+        there's no X-Forwarded-For header.
+        """
+        self._runtest({}, "127.0.0.1", {})
+
+    def _runtest(self, headers, expected_ip, make_request_args):
+        device_id = "bleb"
+
+        body = json.dumps(
+            {
+                "type": "m.login.password",
+                "user": "bob",
+                "password": "abc123",
+                "device_id": device_id,
+            }
+        )
+        request, channel = self.make_request(
+            "POST", "/_matrix/client/r0/login", body.encode('utf8'), **make_request_args
+        )
+        self.render(request)
+        self.assertEqual(channel.code, 200)
+        access_token = channel.json_body["access_token"].encode('ascii')
+
+        # Advance to a known time
+        self.reactor.advance(123456 - self.reactor.seconds())
+
+        request, channel = self.make_request(
+            "GET",
+            "/_matrix/client/r0/admin/users/" + self.user_id,
+            body.encode('utf8'),
+            access_token=access_token,
+            **make_request_args
+        )
+        request.requestHeaders.addRawHeader(b"User-Agent", b"Mozzila pizza")
+
+        # Add the optional headers
+        for h, v in headers.items():
+            request.requestHeaders.addRawHeader(h, v)
+        self.render(request)
+
+        # Advance so the save loop occurs
+        self.reactor.advance(100)
+
+        result = self.get_success(
+            self.store.get_last_client_ip_by_device(self.user_id, device_id)
+        )
+        r = result[(self.user_id, device_id)]
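+        # last_seen is in milliseconds: the request landed just after the
+        # reactor was advanced to t=123456s (the extra 100ms presumably comes
+        # from the fake reactor being pumped while the request was rendered)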
+        self.assertDictContainsSubset(
+            {
+                "user_id": self.user_id,
+                "device_id": device_id,
+                "ip": expected_ip,
+                "user_agent": "Mozzila pizza",
+                "last_seen": 123456100,
+            },
+            r,
+        )
diff --git a/tests/storage/test_state.py b/tests/storage/test_state.py
index b910965932..b9c5b39d59 100644
--- a/tests/storage/test_state.py
+++ b/tests/storage/test_state.py
@@ -75,6 +75,45 @@ class StateStoreTestCase(tests.unittest.TestCase):
         self.assertEqual(len(s1), len(s2))
 
     @defer.inlineCallbacks
+    def test_get_state_groups_ids(self):
+        e1 = yield self.inject_state_event(
+            self.room, self.u_alice, EventTypes.Create, '', {}
+        )
+        e2 = yield self.inject_state_event(
+            self.room, self.u_alice, EventTypes.Name, '', {"name": "test room"}
+        )
+
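+        # get_state_groups_ids maps each state group to a dict of
+        # (event type, state key) -> event id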
+        state_group_map = yield self.store.get_state_groups_ids(
+            self.room, [e2.event_id]
+        )
+        self.assertEqual(len(state_group_map), 1)
+        state_map = list(state_group_map.values())[0]
+        self.assertDictEqual(
+            state_map,
+            {
+                (EventTypes.Create, ''): e1.event_id,
+                (EventTypes.Name, ''): e2.event_id,
+            },
+        )
+
+    @defer.inlineCallbacks
+    def test_get_state_groups(self):
+        e1 = yield self.inject_state_event(
+            self.room, self.u_alice, EventTypes.Create, '', {}
+        )
+        e2 = yield self.inject_state_event(
+            self.room, self.u_alice, EventTypes.Name, '', {"name": "test room"}
+        )
+
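+        # get_state_groups resolves each state group to its full list of events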
+        state_group_map = yield self.store.get_state_groups(
+            self.room, [e2.event_id])
+        self.assertEqual(len(state_group_map), 1)
+        state_list = list(state_group_map.values())[0]
+
+        self.assertEqual(
+            {ev.event_id for ev in state_list},
+            {e1.event_id, e2.event_id},
+        )
+
+    @defer.inlineCallbacks
     def test_get_state_for_event(self):
 
         # this defaults to a linear DAG as each new injection defaults to whatever
diff --git a/tests/test_federation.py b/tests/test_federation.py
index 2540604fcc..ff55c7a627 100644
--- a/tests/test_federation.py
+++ b/tests/test_federation.py
@@ -6,6 +6,7 @@ from twisted.internet.defer import maybeDeferred, succeed
 from synapse.events import FrozenEvent
 from synapse.types import Requester, UserID
 from synapse.util import Clock
+from synapse.util.logcontext import LoggingContext
 
 from tests import unittest
 from tests.server import ThreadedMemoryReactorClock, setup_test_homeserver
@@ -117,9 +118,10 @@ class MessageAcceptTests(unittest.TestCase):
             }
         )
 
-        d = self.handler.on_receive_pdu(
-            "test.serv", lying_event, sent_to_us_directly=True
-        )
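+        # on_receive_pdu does its work under a nested logging context, so it
+        # should be called from within a named context rather than the sentinel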
+        with LoggingContext(request="lying_event"):
+            d = self.handler.on_receive_pdu(
+                "test.serv", lying_event, sent_to_us_directly=True
+            )
 
         # Step the reactor, so the database fetches come back
         self.reactor.advance(1)
@@ -209,11 +211,12 @@ class MessageAcceptTests(unittest.TestCase):
             }
         )
 
-        d = self.handler.on_receive_pdu(
-            "test.serv", good_event, sent_to_us_directly=True
-        )
-        self.reactor.advance(1)
-        self.assertEqual(self.successResultOf(d), None)
+        with LoggingContext(request="good_event"):
+            d = self.handler.on_receive_pdu(
+                "test.serv", good_event, sent_to_us_directly=True
+            )
+            self.reactor.advance(1)
+            self.assertEqual(self.successResultOf(d), None)
 
         bad_event = FrozenEvent(
             {
@@ -230,10 +233,11 @@ class MessageAcceptTests(unittest.TestCase):
             }
         )
 
-        d = self.handler.on_receive_pdu(
-            "test.serv", bad_event, sent_to_us_directly=True
-        )
-        self.reactor.advance(1)
+        with LoggingContext(request="bad_event"):
+            d = self.handler.on_receive_pdu(
+                "test.serv", bad_event, sent_to_us_directly=True
+            )
+            self.reactor.advance(1)
 
         extrem = maybeDeferred(
             self.homeserver.datastore.get_latest_event_ids_in_room, self.room_id
diff --git a/tests/test_server.py b/tests/test_server.py
index ef74544e93..4045fdadc3 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -1,14 +1,35 @@
+# Copyright 2018 New Vector Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
 import re
 
+from six import StringIO
+
 from twisted.internet.defer import Deferred
-from twisted.test.proto_helpers import MemoryReactorClock
+from twisted.python.failure import Failure
+from twisted.test.proto_helpers import AccumulatingProtocol, MemoryReactorClock
+from twisted.web.resource import Resource
+from twisted.web.server import NOT_DONE_YET
 
 from synapse.api.errors import Codes, SynapseError
 from synapse.http.server import JsonResource
+from synapse.http.site import SynapseSite, logger
 from synapse.util import Clock
 
 from tests import unittest
-from tests.server import make_request, render, setup_test_homeserver
+from tests.server import FakeTransport, make_request, render, setup_test_homeserver
 
 
 class JsonResourceTests(unittest.TestCase):
@@ -121,3 +142,52 @@ class JsonResourceTests(unittest.TestCase):
         self.assertEqual(channel.result["code"], b'400')
         self.assertEqual(channel.json_body["error"], "Unrecognized request")
         self.assertEqual(channel.json_body["errcode"], "M_UNRECOGNIZED")
+
+
+class SiteTestCase(unittest.HomeserverTestCase):
+    def test_lose_connection(self):
+        """
+        When the connection is lost, the request URI is logged with the
+        access token redacted.
+        """
+
+        class HangingResource(Resource):
+            """
+            A Resource that hangs indefinitely, as if it were still
+            processing a response.
+            """
+
+            def render(self, request):
+                return NOT_DONE_YET
+
+        # Set up a logging handler that we can inspect afterwards
+        output = StringIO()
+        handler = logging.StreamHandler(output)
+        logger.addHandler(handler)
+        old_level = logger.level
+        logger.setLevel(logging.DEBUG)
+        self.addCleanup(logger.setLevel, old_level)
+        self.addCleanup(logger.removeHandler, handler)
+
+        # Make a resource and a Site, the resource will hang and allow us to
+        # time out the request while it's 'processing'
+        base_resource = Resource()
+        base_resource.putChild(b'', HangingResource())
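+        # (SynapseSite positional args here: logger name, site tag, listener
+        # config, root resource, server version string)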
+        site = SynapseSite("test", "site_tag", {}, base_resource, "1.0")
+
+        server = site.buildProtocol(None)
+        client = AccumulatingProtocol()
+        client.makeConnection(FakeTransport(server, self.reactor))
+        server.makeConnection(FakeTransport(client, self.reactor))
+
+        # Send a request with an access token that will get redacted
+        server.dataReceived(b"GET /?access_token=bar HTTP/1.0\r\n\r\n")
+        self.pump()
+
+        # Lose the connection
+        e = Failure(Exception("Failed123"))
+        server.connectionLost(e)
+        handler.flush()
+
+        # Our access token is redacted and the failure reason is logged.
+        self.assertIn("/?access_token=<redacted>", output.getvalue())
+        self.assertIn("Failed123", output.getvalue())
diff --git a/tests/unittest.py b/tests/unittest.py
index a3d39920db..043710afaf 100644
--- a/tests/unittest.py
+++ b/tests/unittest.py
@@ -26,6 +26,7 @@ from twisted.internet.defer import Deferred
 from twisted.trial import unittest
 
 from synapse.http.server import JsonResource
+from synapse.http.site import SynapseRequest
 from synapse.server import HomeServer
 from synapse.types import UserID, create_requester
 from synapse.util.logcontext import LoggingContextFilter
@@ -120,7 +121,7 @@ class TestCase(unittest.TestCase):
             try:
                 self.assertEquals(attrs[key], getattr(obj, key))
             except AssertionError as e:
-                raise (type(e))(e.message + " for '.%s'" % key)
+                raise (type(e))(str(e) + " for '.%s'" % key)
 
     def assert_dict(self, required, actual):
         """Does a partial assert of a dict.
@@ -219,7 +220,8 @@ class HomeserverTestCase(TestCase):
 
-        Function to be overridden in subclasses.
+        Function to optionally be overridden in subclasses.
         """
-        raise NotImplementedError()
+        hs = self.setup_test_homeserver()
+        return hs
 
     def prepare(self, reactor, clock, homeserver):
         """
@@ -236,7 +238,9 @@ class HomeserverTestCase(TestCase):
         Function to optionally be overridden in subclasses.
         """
 
-    def make_request(self, method, path, content=b""):
+    def make_request(
+        self, method, path, content=b"", access_token=None, request=SynapseRequest
+    ):
         """
         Create a SynapseRequest at the path using the method and containing the
         given content.
@@ -254,7 +258,7 @@ class HomeserverTestCase(TestCase):
         if isinstance(content, dict):
             content = json.dumps(content).encode('utf8')
 
-        return make_request(method, path, content)
+        return make_request(method, path, content, access_token, request)
 
     def render(self, request):
         """
diff --git a/tests/util/test_expiring_cache.py b/tests/util/test_expiring_cache.py
index 5cbada4eda..50bc7702d2 100644
--- a/tests/util/test_expiring_cache.py
+++ b/tests/util/test_expiring_cache.py
@@ -65,7 +65,6 @@ class ExpiringCacheTestCase(unittest.TestCase):
     def test_time_eviction(self):
         clock = MockClock()
         cache = ExpiringCache("test", clock, expiry_ms=1000)
-        cache.start()
 
         cache["key"] = 1
         clock.advance_time(0.5)
diff --git a/tests/util/test_logcontext.py b/tests/util/test_logcontext.py
index 4633db77b3..8adaee3c8d 100644
--- a/tests/util/test_logcontext.py
+++ b/tests/util/test_logcontext.py
@@ -159,6 +159,11 @@ class LoggingContextTestCase(unittest.TestCase):
             self.assertEqual(r, "bum")
             self._check_test_key("one")
 
+    def test_nested_logging_context(self):
+        with LoggingContext(request="foo"):
+            nested_context = logcontext.nested_logging_context(suffix="bar")
+            self.assertEqual(nested_context.request, "foo-bar")
+
 
 # a function which returns a deferred which has been "called", but
 # which had a function which returned another incomplete deferred on
diff --git a/tests/utils.py b/tests/utils.py
index 215226debf..aaed1149c3 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -16,7 +16,9 @@
 import atexit
 import hashlib
 import os
+import time
 import uuid
+import warnings
 from inspect import getcallargs
 
 from mock import Mock, patch
@@ -237,20 +239,41 @@ def setup_test_homeserver(
         else:
             # We need to do cleanup on PostgreSQL
             def cleanup():
+                import psycopg2
+
                 # Close all the db pools
                 hs.get_db_pool().close()
 
+                dropped = False
+
                 # Drop the test database
                 db_conn = db_engine.module.connect(
                     database=POSTGRES_BASE_DB, user=POSTGRES_USER
                 )
                 db_conn.autocommit = True
                 cur = db_conn.cursor()
-                cur.execute("DROP DATABASE IF EXISTS %s;" % (test_db,))
-                db_conn.commit()
+
+                # Try a few times to drop the DB. Some things may hold on to the
+                # database for a few more seconds due to flakiness, preventing
+                # us from dropping it when the test is over. If we can't drop
+                # it, warn and move on.
+                for _ in range(5):
+                    try:
+                        cur.execute("DROP DATABASE IF EXISTS %s;" % (test_db,))
+                        db_conn.commit()
+                        dropped = True
+                        break
+                    except psycopg2.OperationalError as e:
+                        warnings.warn(
+                            "Couldn't drop old db: " + str(e), category=UserWarning
+                        )
+                        time.sleep(0.5)
+
                 cur.close()
                 db_conn.close()
 
+                if not dropped:
+                    warnings.warn("Failed to drop old DB.", category=UserWarning)
+
             if not LEAVE_DB:
                 # Register the cleanup hook
                 cleanup_func(cleanup)
diff --git a/tox.ini b/tox.ini
index 80ac9324df..87b5e4782d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@ envlist = packaging, py27, py36, pep8, check_isort
 [base]
 deps =
     coverage
-    Twisted>=15.1
+    Twisted>=17.1
     mock
     python-subunit
     junitxml
@@ -64,12 +64,42 @@ setenv =
     {[base]setenv}
     SYNAPSE_POSTGRES = 1
 
+# A test suite for the oldest supported versions of Python libraries, to catch
+# any uses of APIs not available in them.
+[testenv:py27-old]
+skip_install=True
+deps =
+    # Old automat version for Twisted
+    Automat == 0.3.0
+
+    mock
+    lxml
+commands =
+    /usr/bin/find "{toxinidir}" -name '*.pyc' -delete
+    # Make all greater-than pins exact pins, so that we test the oldest
+    # versions of our direct dependencies; drop the psycopg2 pin, and bump
+    # pyopenssl to 17.0.0, which works against an OpenSSL 1.1-compiled
+    # cryptography (older releases don't compile on Travis).
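+    # e.g. a requirement like "foo>=1.0" is rewritten to "foo==1.0" before
+    # being piped to pip install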
+    /bin/sh -c 'python -m synapse.python_dependencies | sed -e "s/>=/==/g" -e "s/psycopg2==2.6//" -e "s/pyopenssl==16.0.0/pyopenssl==17.0.0/" | xargs pip install'
+    # Install Synapse itself. This won't update any libraries.
+    pip install -e .
+    {envbindir}/trial {env:TRIAL_FLAGS:} {posargs:tests} {env:TOXSUFFIX:}
+
 [testenv:py35]
 usedevelop=true
 
 [testenv:py36]
 usedevelop=true
 
+[testenv:py36-postgres]
+usedevelop=true
+deps =
+    {[base]deps}
+    psycopg2
+setenv =
+    {[base]setenv}
+    SYNAPSE_POSTGRES = 1
+
 [testenv:packaging]
 deps =
     check-manifest