summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--changelog.d/15482.feature1
-rw-r--r--docs/admin_api/statistics.md49
-rw-r--r--synapse/rest/admin/__init__.py6
-rw-r--r--synapse/rest/admin/statistics.py25
-rw-r--r--synapse/storage/controllers/__init__.py2
-rw-r--r--synapse/storage/controllers/stats.py113
6 files changed, 195 insertions, 1 deletions
diff --git a/changelog.d/15482.feature b/changelog.d/15482.feature
new file mode 100644
index 0000000000..f3e9f2a5b2
--- /dev/null
+++ b/changelog.d/15482.feature
@@ -0,0 +1 @@
+Add admin endpoint to query the largest rooms by disk space used in the database.
diff --git a/docs/admin_api/statistics.md b/docs/admin_api/statistics.md
index 03b3621e55..2bd417e900 100644
--- a/docs/admin_api/statistics.md
+++ b/docs/admin_api/statistics.md
@@ -81,3 +81,52 @@ The following fields are returned in the JSON response body:
   - `user_id` - string - Fully-qualified user ID (ex. `@user:server.com`).
 * `next_token` - integer - Opaque value used for pagination. See above.
 * `total` - integer - Total number of users after filtering.
+
+
+# Get largest rooms by size in database
+
+Returns the 10 largest rooms and an estimate of how much space in the database
+they are taking.
+
+This does not include the size of any associated media associated with the room.
+
+Returns an error on SQLite.
+
+*Note:* This uses the planner statistics from PostgreSQL to do the estimates,
+which means that the returned information can vary widely from reality. However,
+it should be enough to get a rough idea of where database disk space is going.
+
+
+The API is:
+
+```
+GET /_synapse/admin/v1/statistics/statistics/database/rooms
+```
+
+A response body like the following is returned:
+
+```json
+{
+  "rooms": [
+    {
+      "room_id": "!OGEhHVWSdvArJzumhm:matrix.org",
+      "estimated_size": 47325417353
+    }
+  ],
+}
+```
+
+
+
+**Response**
+
+The following fields are returned in the JSON response body:
+
+* `rooms` - An array of objects, sorted by largest room first. Objects contain
+  the following fields:
+  - `room_id` - string - The room ID.
+  - `estimated_size` - integer - Estimated disk space used in bytes by the room
+    in the database.
+
+
+*Added in Synapse 1.83.0*
diff --git a/synapse/rest/admin/__init__.py b/synapse/rest/admin/__init__.py
index 79f22a59f1..770df261ce 100644
--- a/synapse/rest/admin/__init__.py
+++ b/synapse/rest/admin/__init__.py
@@ -68,7 +68,10 @@ from synapse.rest.admin.rooms import (
     RoomTimestampToEventRestServlet,
 )
 from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet
-from synapse.rest.admin.statistics import UserMediaStatisticsRestServlet
+from synapse.rest.admin.statistics import (
+    LargestRoomsStatistics,
+    UserMediaStatisticsRestServlet,
+)
 from synapse.rest.admin.username_available import UsernameAvailableRestServlet
 from synapse.rest.admin.users import (
     AccountDataRestServlet,
@@ -259,6 +262,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
     UserRestServletV2(hs).register(http_server)
     UsersRestServletV2(hs).register(http_server)
     UserMediaStatisticsRestServlet(hs).register(http_server)
+    LargestRoomsStatistics(hs).register(http_server)
     EventReportDetailRestServlet(hs).register(http_server)
     EventReportsRestServlet(hs).register(http_server)
     AccountDataRestServlet(hs).register(http_server)
diff --git a/synapse/rest/admin/statistics.py b/synapse/rest/admin/statistics.py
index 9c45f4650d..19780e4b4c 100644
--- a/synapse/rest/admin/statistics.py
+++ b/synapse/rest/admin/statistics.py
@@ -113,3 +113,28 @@ class UserMediaStatisticsRestServlet(RestServlet):
             ret["next_token"] = start + len(users_media)
 
         return HTTPStatus.OK, ret
+
+
+class LargestRoomsStatistics(RestServlet):
+    """Get the largest rooms by database size.
+
+    Only works when using PostgreSQL.
+    """
+
+    PATTERNS = admin_patterns("/statistics/database/rooms$")
+
+    def __init__(self, hs: "HomeServer"):
+        self.auth = hs.get_auth()
+        self.stats_controller = hs.get_storage_controllers().stats
+
+    async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
+        await assert_requester_is_admin(self.auth, request)
+
+        room_sizes = await self.stats_controller.get_room_db_size_estimate()
+
+        return HTTPStatus.OK, {
+            "rooms": [
+                {"room_id": room_id, "estimated_size": size}
+                for room_id, size in room_sizes
+            ]
+        }
diff --git a/synapse/storage/controllers/__init__.py b/synapse/storage/controllers/__init__.py
index 45101cda7a..0ef8602631 100644
--- a/synapse/storage/controllers/__init__.py
+++ b/synapse/storage/controllers/__init__.py
@@ -19,6 +19,7 @@ from synapse.storage.controllers.persist_events import (
 )
 from synapse.storage.controllers.purge_events import PurgeEventsStorageController
 from synapse.storage.controllers.state import StateStorageController
+from synapse.storage.controllers.stats import StatsController
 from synapse.storage.databases import Databases
 from synapse.storage.databases.main import DataStore
 
@@ -40,6 +41,7 @@ class StorageControllers:
 
         self.purge_events = PurgeEventsStorageController(hs, stores)
         self.state = StateStorageController(hs, stores)
+        self.stats = StatsController(hs, stores)
 
         self.persistence = None
         if stores.persist_events:
diff --git a/synapse/storage/controllers/stats.py b/synapse/storage/controllers/stats.py
new file mode 100644
index 0000000000..988e44c6af
--- /dev/null
+++ b/synapse/storage/controllers/stats.py
@@ -0,0 +1,113 @@
+# Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from collections import Counter
+from typing import TYPE_CHECKING, Collection, List, Tuple
+
+from synapse.api.errors import SynapseError
+from synapse.storage.database import LoggingTransaction
+from synapse.storage.databases import Databases
+from synapse.storage.engines import PostgresEngine
+
+if TYPE_CHECKING:
+    from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+
+class StatsController:
+    """High level interface for getting statistics."""
+
+    def __init__(self, hs: "HomeServer", stores: Databases):
+        self.stores = stores
+
+    async def get_room_db_size_estimate(self) -> List[Tuple[str, int]]:
+        """Get an estimate of the largest rooms and how much database space they
+        use, in bytes.
+
+        Only works against PostgreSQL.
+
+        Note: this uses the postgres statistics so is a very rough estimate.
+        """
+
+        # Note: We look at both tables on the main and state databases.
+        if not isinstance(self.stores.main.database_engine, PostgresEngine):
+            raise SynapseError(400, "Endpoint requires using PostgreSQL")
+
+        if not isinstance(self.stores.state.database_engine, PostgresEngine):
+            raise SynapseError(400, "Endpoint requires using PostgreSQL")
+
+        # For each "large" table, we go through and get the largest rooms
+        # and an estimate of how much space they take. We can then sum the
+        # results and return the top 10.
+        #
+        # This isn't the most accurate, but given all of these are estimates
+        # anyway its good enough.
+        room_estimates: Counter[str] = Counter()
+
+        # Return size of the table on disk, including indexes and TOAST.
+        table_sql = """
+            SELECT pg_total_relation_size(?)
+        """
+
+        # Get an estimate for the largest rooms and their frequency.
+        #
+        # Note: the cast here is a hack to cast from `anyarray` to an actual
+        # type. This ensures that psycopg2 passes us a back a a Python list.
+        column_sql = """
+            SELECT
+                most_common_vals::TEXT::TEXT[], most_common_freqs::TEXT::NUMERIC[]
+            FROM pg_stats
+            WHERE tablename = ? and attname = 'room_id'
+        """
+
+        def get_room_db_size_estimate_txn(
+            txn: LoggingTransaction,
+            tables: Collection[str],
+        ) -> None:
+            for table in tables:
+                txn.execute(table_sql, (table,))
+                row = txn.fetchone()
+                assert row is not None
+                (table_size,) = row
+
+                txn.execute(column_sql, (table,))
+                row = txn.fetchone()
+                assert row is not None
+                vals, freqs = row
+
+                for room_id, freq in zip(vals, freqs):
+                    room_estimates[room_id] += int(freq * table_size)
+
+        await self.stores.main.db_pool.runInteraction(
+            "get_room_db_size_estimate_main",
+            get_room_db_size_estimate_txn,
+            (
+                "event_json",
+                "events",
+                "event_search",
+                "event_edges",
+                "event_push_actions",
+                "stream_ordering_to_exterm",
+            ),
+        )
+
+        await self.stores.state.db_pool.runInteraction(
+            "get_room_db_size_estimate_state",
+            get_room_db_size_estimate_txn,
+            ("state_groups_state",),
+        )
+
+        return room_estimates.most_common(10)