diff --git a/changelog.d/15482.feature b/changelog.d/15482.feature
new file mode 100644
index 0000000000..f3e9f2a5b2
--- /dev/null
+++ b/changelog.d/15482.feature
@@ -0,0 +1 @@
+Add admin endpoint to query the largest rooms by disk space used in the database.
diff --git a/docs/admin_api/statistics.md b/docs/admin_api/statistics.md
index 03b3621e55..2bd417e900 100644
--- a/docs/admin_api/statistics.md
+++ b/docs/admin_api/statistics.md
@@ -81,3 +81,52 @@ The following fields are returned in the JSON response body:
- `user_id` - string - Fully-qualified user ID (ex. `@user:server.com`).
* `next_token` - integer - Opaque value used for pagination. See above.
* `total` - integer - Total number of users after filtering.
+
+
+# Get largest rooms by size in database
+
+Returns the 10 largest rooms and an estimate of how much space in the database
+they are taking.
+
+This does not include the size of any associated media associated with the room.
+
+Returns an error on SQLite.
+
+*Note:* This uses the planner statistics from PostgreSQL to do the estimates,
+which means that the returned information can vary widely from reality. However,
+it should be enough to get a rough idea of where database disk space is going.
+
+
+The API is:
+
+```
+GET /_synapse/admin/v1/statistics/statistics/database/rooms
+```
+
+A response body like the following is returned:
+
+```json
+{
+ "rooms": [
+ {
+ "room_id": "!OGEhHVWSdvArJzumhm:matrix.org",
+ "estimated_size": 47325417353
+ }
+ ],
+}
+```
+
+
+
+**Response**
+
+The following fields are returned in the JSON response body:
+
+* `rooms` - An array of objects, sorted by largest room first. Objects contain
+ the following fields:
+ - `room_id` - string - The room ID.
+ - `estimated_size` - integer - Estimated disk space used in bytes by the room
+ in the database.
+
+
+*Added in Synapse 1.83.0*
diff --git a/synapse/rest/admin/__init__.py b/synapse/rest/admin/__init__.py
index 79f22a59f1..770df261ce 100644
--- a/synapse/rest/admin/__init__.py
+++ b/synapse/rest/admin/__init__.py
@@ -68,7 +68,10 @@ from synapse.rest.admin.rooms import (
RoomTimestampToEventRestServlet,
)
from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet
-from synapse.rest.admin.statistics import UserMediaStatisticsRestServlet
+from synapse.rest.admin.statistics import (
+ LargestRoomsStatistics,
+ UserMediaStatisticsRestServlet,
+)
from synapse.rest.admin.username_available import UsernameAvailableRestServlet
from synapse.rest.admin.users import (
AccountDataRestServlet,
@@ -259,6 +262,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
UserRestServletV2(hs).register(http_server)
UsersRestServletV2(hs).register(http_server)
UserMediaStatisticsRestServlet(hs).register(http_server)
+ LargestRoomsStatistics(hs).register(http_server)
EventReportDetailRestServlet(hs).register(http_server)
EventReportsRestServlet(hs).register(http_server)
AccountDataRestServlet(hs).register(http_server)
diff --git a/synapse/rest/admin/statistics.py b/synapse/rest/admin/statistics.py
index 9c45f4650d..19780e4b4c 100644
--- a/synapse/rest/admin/statistics.py
+++ b/synapse/rest/admin/statistics.py
@@ -113,3 +113,28 @@ class UserMediaStatisticsRestServlet(RestServlet):
ret["next_token"] = start + len(users_media)
return HTTPStatus.OK, ret
+
+
+class LargestRoomsStatistics(RestServlet):
+ """Get the largest rooms by database size.
+
+ Only works when using PostgreSQL.
+ """
+
+ PATTERNS = admin_patterns("/statistics/database/rooms$")
+
+ def __init__(self, hs: "HomeServer"):
+ self.auth = hs.get_auth()
+ self.stats_controller = hs.get_storage_controllers().stats
+
+ async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
+ await assert_requester_is_admin(self.auth, request)
+
+ room_sizes = await self.stats_controller.get_room_db_size_estimate()
+
+ return HTTPStatus.OK, {
+ "rooms": [
+ {"room_id": room_id, "estimated_size": size}
+ for room_id, size in room_sizes
+ ]
+ }
diff --git a/synapse/storage/controllers/__init__.py b/synapse/storage/controllers/__init__.py
index 45101cda7a..0ef8602631 100644
--- a/synapse/storage/controllers/__init__.py
+++ b/synapse/storage/controllers/__init__.py
@@ -19,6 +19,7 @@ from synapse.storage.controllers.persist_events import (
)
from synapse.storage.controllers.purge_events import PurgeEventsStorageController
from synapse.storage.controllers.state import StateStorageController
+from synapse.storage.controllers.stats import StatsController
from synapse.storage.databases import Databases
from synapse.storage.databases.main import DataStore
@@ -40,6 +41,7 @@ class StorageControllers:
self.purge_events = PurgeEventsStorageController(hs, stores)
self.state = StateStorageController(hs, stores)
+ self.stats = StatsController(hs, stores)
self.persistence = None
if stores.persist_events:
diff --git a/synapse/storage/controllers/stats.py b/synapse/storage/controllers/stats.py
new file mode 100644
index 0000000000..988e44c6af
--- /dev/null
+++ b/synapse/storage/controllers/stats.py
@@ -0,0 +1,113 @@
+# Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from collections import Counter
+from typing import TYPE_CHECKING, Collection, List, Tuple
+
+from synapse.api.errors import SynapseError
+from synapse.storage.database import LoggingTransaction
+from synapse.storage.databases import Databases
+from synapse.storage.engines import PostgresEngine
+
+if TYPE_CHECKING:
+ from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+
+class StatsController:
+ """High level interface for getting statistics."""
+
+ def __init__(self, hs: "HomeServer", stores: Databases):
+ self.stores = stores
+
+ async def get_room_db_size_estimate(self) -> List[Tuple[str, int]]:
+ """Get an estimate of the largest rooms and how much database space they
+ use, in bytes.
+
+ Only works against PostgreSQL.
+
+ Note: this uses the postgres statistics so is a very rough estimate.
+ """
+
+ # Note: We look at both tables on the main and state databases.
+ if not isinstance(self.stores.main.database_engine, PostgresEngine):
+ raise SynapseError(400, "Endpoint requires using PostgreSQL")
+
+ if not isinstance(self.stores.state.database_engine, PostgresEngine):
+ raise SynapseError(400, "Endpoint requires using PostgreSQL")
+
+ # For each "large" table, we go through and get the largest rooms
+ # and an estimate of how much space they take. We can then sum the
+ # results and return the top 10.
+ #
+ # This isn't the most accurate, but given all of these are estimates
+ # anyway its good enough.
+ room_estimates: Counter[str] = Counter()
+
+ # Return size of the table on disk, including indexes and TOAST.
+ table_sql = """
+ SELECT pg_total_relation_size(?)
+ """
+
+ # Get an estimate for the largest rooms and their frequency.
+ #
+ # Note: the cast here is a hack to cast from `anyarray` to an actual
+ # type. This ensures that psycopg2 passes us a back a a Python list.
+ column_sql = """
+ SELECT
+ most_common_vals::TEXT::TEXT[], most_common_freqs::TEXT::NUMERIC[]
+ FROM pg_stats
+ WHERE tablename = ? and attname = 'room_id'
+ """
+
+ def get_room_db_size_estimate_txn(
+ txn: LoggingTransaction,
+ tables: Collection[str],
+ ) -> None:
+ for table in tables:
+ txn.execute(table_sql, (table,))
+ row = txn.fetchone()
+ assert row is not None
+ (table_size,) = row
+
+ txn.execute(column_sql, (table,))
+ row = txn.fetchone()
+ assert row is not None
+ vals, freqs = row
+
+ for room_id, freq in zip(vals, freqs):
+ room_estimates[room_id] += int(freq * table_size)
+
+ await self.stores.main.db_pool.runInteraction(
+ "get_room_db_size_estimate_main",
+ get_room_db_size_estimate_txn,
+ (
+ "event_json",
+ "events",
+ "event_search",
+ "event_edges",
+ "event_push_actions",
+ "stream_ordering_to_exterm",
+ ),
+ )
+
+ await self.stores.state.db_pool.runInteraction(
+ "get_room_db_size_estimate_state",
+ get_room_db_size_estimate_txn,
+ ("state_groups_state",),
+ )
+
+ return room_estimates.most_common(10)
|