diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py
new file mode 100644
index 0000000000..c8a4a809f1
--- /dev/null
+++ b/synapse/media/url_previewer.py
@@ -0,0 +1,833 @@
+# Copyright 2016 OpenMarket Ltd
+# Copyright 2020-2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import datetime
+import errno
+import fnmatch
+import logging
+import os
+import re
+import shutil
+import sys
+import traceback
+from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
+from urllib.parse import urljoin, urlparse, urlsplit
+from urllib.request import urlopen
+
+import attr
+
+from twisted.internet.defer import Deferred
+from twisted.internet.error import DNSLookupError
+
+from synapse.api.errors import Codes, SynapseError
+from synapse.http.client import SimpleHttpClient
+from synapse.logging.context import make_deferred_yieldable, run_in_background
+from synapse.media._base import FileInfo, get_filename_from_headers
+from synapse.media.media_storage import MediaStorage
+from synapse.media.oembed import OEmbedProvider
+from synapse.media.preview_html import decode_body, parse_html_to_open_graph
+from synapse.metrics.background_process_metrics import run_as_background_process
+from synapse.types import JsonDict, UserID
+from synapse.util import json_encoder
+from synapse.util.async_helpers import ObservableDeferred
+from synapse.util.caches.expiringcache import ExpiringCache
+from synapse.util.stringutils import random_string
+
+if TYPE_CHECKING:
+ from synapse.media.media_repository import MediaRepository
+ from synapse.server import HomeServer
+
+logger = logging.getLogger(__name__)
+
+OG_TAG_NAME_MAXLEN = 50
+OG_TAG_VALUE_MAXLEN = 1000
+
+ONE_HOUR = 60 * 60 * 1000
+ONE_DAY = 24 * ONE_HOUR
+IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class DownloadResult:
+ length: int
+ uri: str
+ response_code: int
+ media_type: str
+ download_name: Optional[str]
+ expires: int
+ etag: Optional[str]
+
+
+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class MediaInfo:
+ """
+ Information parsed from downloading media being previewed.
+ """
+
+ # The Content-Type header of the response.
+ media_type: str
+ # The length (in bytes) of the downloaded media.
+ media_length: int
+ # The media filename, according to the server. This is parsed from the
+ # returned headers, if possible.
+ download_name: Optional[str]
+ # The time of the preview.
+ created_ts_ms: int
+ # Information from the media storage provider about where the file is stored
+ # on disk.
+ filesystem_id: str
+ filename: str
+ # The URI being previewed.
+ uri: str
+ # The HTTP response code.
+ response_code: int
+ # The timestamp (in milliseconds) of when this preview expires.
+ expires: int
+ # The ETag header of the response.
+ etag: Optional[str]
+
+
+class UrlPreviewer:
+ """
+    Generates an Open Graph (https://ogp.me/) response (with some
+    Matrix-specific additions) for a given URL.
+
+ When Synapse is asked to preview a URL it does the following:
+
+ 1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the
+ config).
+ 2. Checks the URL against an in-memory cache and returns the result if it exists. (This
+ is also used to de-duplicate processing of multiple in-flight requests at once.)
+ 3. Kicks off a background process to generate a preview:
+ 1. Checks URL and timestamp against the database cache and returns the result if it
+ has not expired and was successful (a 2xx return code).
+       2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
+          does, the oEmbed URL is used as the URL to download.
+ 3. Downloads the URL and stores it into a file via the media storage provider
+ and saves the local media metadata.
+ 4. If the media is an image:
+ 1. Generates thumbnails.
+ 2. Generates an Open Graph response based on image properties.
+ 5. If the media is HTML:
+ 1. Decodes the HTML via the stored file.
+ 2. Generates an Open Graph response from the HTML.
+ 3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
+ 1. Downloads the URL and stores it into a file via the media storage provider
+ and saves the local media metadata.
+             2. Converts the oEmbed response to an Open Graph response.
+             3. Overrides any Open Graph data from the HTML with data from oEmbed.
+ 4. If an image exists in the Open Graph response:
+ 1. Downloads the URL and stores it into a file via the media storage
+ provider and saves the local media metadata.
+ 2. Generates thumbnails.
+ 3. Updates the Open Graph response based on image properties.
+ 6. If the media is JSON and an oEmbed URL was found:
+          1. Converts the oEmbed response to an Open Graph response.
+ 2. If a thumbnail or image is in the oEmbed response:
+ 1. Downloads the URL and stores it into a file via the media storage
+ provider and saves the local media metadata.
+ 2. Generates thumbnails.
+ 3. Updates the Open Graph response based on image properties.
+ 7. Stores the result in the database cache.
+ 4. Returns the result.
+
+    If any of the additional requests (e.g. oEmbed autodiscovery, step 5.3, or
+    image thumbnailing, steps 5.4 and 6.2) fail, the URL preview as a whole does
+    not fail; as much information as possible is returned.
+
+ The in-memory cache expires after 1 hour.
+
+ Expired entries in the database cache (and their associated media files) are
+ deleted every 10 seconds. The default expiration time is 1 hour from download.
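+
+    A purely illustrative example of the JSON-encoded Open Graph data this
+    produces for an HTML page with an image (all values are hypothetical):
+
+        {
+            "og:title": "Example page",
+            "og:description": "A short description of the page",
+            "og:image": "mxc://example.com/some_media_id",
+            "og:image:type": "image/png",
+            "og:image:width": 640,
+            "og:image:height": 480,
+            "matrix:image:size": 12345
+        }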
+ """
+
+ def __init__(
+ self,
+ hs: "HomeServer",
+ media_repo: "MediaRepository",
+ media_storage: MediaStorage,
+ ):
+ self.clock = hs.get_clock()
+ self.filepaths = media_repo.filepaths
+ self.max_spider_size = hs.config.media.max_spider_size
+ self.server_name = hs.hostname
+ self.store = hs.get_datastores().main
+ self.client = SimpleHttpClient(
+ hs,
+ treq_args={"browser_like_redirects": True},
+ ip_whitelist=hs.config.media.url_preview_ip_range_whitelist,
+ ip_blacklist=hs.config.media.url_preview_ip_range_blacklist,
+ use_proxy=True,
+ )
+ self.media_repo = media_repo
+ self.primary_base_path = media_repo.primary_base_path
+ self.media_storage = media_storage
+
+ self._oembed = OEmbedProvider(hs)
+
+        # We run the background jobs if we're the instance specified (or if no
+        # instance is specified, in which case we assume there is only one
+        # instance serving media).
+ instance_running_jobs = hs.config.media.media_instance_running_background_jobs
+ self._worker_run_media_background_jobs = (
+ instance_running_jobs is None
+ or instance_running_jobs == hs.get_instance_name()
+ )
+
+ self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist
+ self.url_preview_accept_language = hs.config.media.url_preview_accept_language
+
+ # memory cache mapping urls to an ObservableDeferred returning
+ # JSON-encoded OG metadata
+ self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache(
+ cache_name="url_previews",
+ clock=self.clock,
+ # don't spider URLs more often than once an hour
+ expiry_ms=ONE_HOUR,
+ )
+
+ if self._worker_run_media_background_jobs:
+ self._cleaner_loop = self.clock.looping_call(
+ self._start_expire_url_cache_data, 10 * 1000
+ )
+
+ async def preview(self, url: str, user: UserID, ts: int) -> bytes:
+ # XXX: we could move this into _do_preview if we wanted.
+ url_tuple = urlsplit(url)
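+        # Each blacklist entry is a dict mapping urlsplit() attribute names
+        # (e.g. "scheme", "netloc", "path") to patterns; a URL is blocked if
+        # *every* attribute in an entry matches. Patterns beginning with "^"
+        # are treated as regular expressions, anything else as a shell-style
+        # glob. Illustrative (hypothetical) entries:
+        #   [{"netloc": "*.example.com"}, {"scheme": "http", "path": "^/private"}]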
+ for entry in self.url_preview_url_blacklist:
+ match = True
+ for attrib in entry:
+ pattern = entry[attrib]
+ value = getattr(url_tuple, attrib)
+ logger.debug(
+ "Matching attrib '%s' with value '%s' against pattern '%s'",
+ attrib,
+ value,
+ pattern,
+ )
+
+ if value is None:
+ match = False
+ continue
+
+ # Some attributes might not be parsed as strings by urlsplit (such as the
+ # port, which is parsed as an int). Because we use match functions that
+ # expect strings, we want to make sure that's what we give them.
+ value_str = str(value)
+
+ if pattern.startswith("^"):
+ if not re.match(pattern, value_str):
+ match = False
+ continue
+ else:
+ if not fnmatch.fnmatch(value_str, pattern):
+ match = False
+ continue
+ if match:
+ logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
+ raise SynapseError(
+ 403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
+ )
+
+ # the in-memory cache:
+ # * ensures that only one request is active at a time
+ # * takes load off the DB for the thundering herds
+ # * also caches any failures (unlike the DB) so we don't keep
+ # requesting the same endpoint
+
+ observable = self._cache.get(url)
+
+ if not observable:
+ download = run_in_background(self._do_preview, url, user, ts)
+ observable = ObservableDeferred(download, consumeErrors=True)
+ self._cache[url] = observable
+ else:
+ logger.info("Returning cached response")
+
+ return await make_deferred_yieldable(observable.observe())
+
+ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
+        """Check the db and, if necessary, download the URL and build a preview.
+
+ Args:
+ url: The URL to preview.
+ user: The user requesting the preview.
+ ts: The timestamp requested for the preview.
+
+ Returns:
+            The JSON-encoded Open Graph data, as bytes.
+ """
+ # check the URL cache in the DB (which will also provide us with
+ # historical previews, if we have any)
+ cache_result = await self.store.get_url_cache(url, ts)
+ if (
+ cache_result
+ and cache_result["expires_ts"] > ts
+            and cache_result["response_code"] // 100 == 2
+ ):
+            # It may be stored as text rather than bytes in the database
+            # (e.g. on PostgreSQL). If so, encode it back before handing it on.
+ og = cache_result["og"]
+ if isinstance(og, str):
+ og = og.encode("utf8")
+ return og
+
+ # If this URL can be accessed via oEmbed, use that instead.
+ url_to_download = url
+ oembed_url = self._oembed.get_oembed_url(url)
+ if oembed_url:
+ url_to_download = oembed_url
+
+ media_info = await self._handle_url(url_to_download, user)
+
+ logger.debug("got media_info of '%s'", media_info)
+
+ # The number of milliseconds that the response should be considered valid.
+ expiration_ms = media_info.expires
+ author_name: Optional[str] = None
+
+ if _is_media(media_info.media_type):
+ file_id = media_info.filesystem_id
+ dims = await self.media_repo._generate_thumbnails(
+ None, file_id, file_id, media_info.media_type, url_cache=True
+ )
+
+ og = {
+ "og:description": media_info.download_name,
+ "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
+ "og:image:type": media_info.media_type,
+ "matrix:image:size": media_info.media_length,
+ }
+
+ if dims:
+ og["og:image:width"] = dims["width"]
+ og["og:image:height"] = dims["height"]
+ else:
+                logger.warning("Couldn't get dims for %s", url)
+
+ # define our OG response for this media
+ elif _is_html(media_info.media_type):
+ # TODO: somehow stop a big HTML tree from exploding synapse's RAM
+
+ with open(media_info.filename, "rb") as file:
+ body = file.read()
+
+ tree = decode_body(body, media_info.uri, media_info.media_type)
+ if tree is not None:
+ # Check if this HTML document points to oEmbed information and
+ # defer to that.
+ oembed_url = self._oembed.autodiscover_from_html(tree)
+ og_from_oembed: JsonDict = {}
+ if oembed_url:
+ try:
+ oembed_info = await self._handle_url(
+ oembed_url, user, allow_data_urls=True
+ )
+ except Exception as e:
+ # Fetching the oEmbed info failed, don't block the entire URL preview.
+ logger.warning(
+ "oEmbed fetch failed during URL preview: %s errored with %s",
+ oembed_url,
+ e,
+ )
+ else:
+ (
+ og_from_oembed,
+ author_name,
+ expiration_ms,
+ ) = await self._handle_oembed_response(
+ url, oembed_info, expiration_ms
+ )
+
+ # Parse Open Graph information from the HTML in case the oEmbed
+ # response failed or is incomplete.
+ og_from_html = parse_html_to_open_graph(tree)
+
+ # Compile the Open Graph response by using the scraped
+ # information from the HTML and overlaying any information
+ # from the oEmbed response.
+ og = {**og_from_html, **og_from_oembed}
+
+ await self._precache_image_url(user, media_info, og)
+ else:
+ og = {}
+
+ elif oembed_url:
+ # Handle the oEmbed information.
+ og, author_name, expiration_ms = await self._handle_oembed_response(
+ url, media_info, expiration_ms
+ )
+ await self._precache_image_url(user, media_info, og)
+
+ else:
+ logger.warning("Failed to find any OG data in %s", url)
+ og = {}
+
+ # If we don't have a title but we have author_name, copy it as
+ # title
+ if not og.get("og:title") and author_name:
+ og["og:title"] = author_name
+
+ # filter out any stupidly long values
+ keys_to_remove = []
+ for k, v in og.items():
+ # values can be numeric as well as strings, hence the cast to str
+ if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
+ logger.warning(
+ "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
+ )
+ keys_to_remove.append(k)
+ for k in keys_to_remove:
+ del og[k]
+
+ logger.debug("Calculated OG for %s as %s", url, og)
+
+ jsonog = json_encoder.encode(og)
+
+ # Cap the amount of time to consider a response valid.
+ expiration_ms = min(expiration_ms, ONE_DAY)
+
+ # store OG in history-aware DB cache
+ await self.store.store_url_cache(
+ url,
+ media_info.response_code,
+ media_info.etag,
+ media_info.created_ts_ms + expiration_ms,
+ jsonog,
+ media_info.filesystem_id,
+ media_info.created_ts_ms,
+ )
+
+ return jsonog.encode("utf8")
+
+ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
+ """
+ Fetches a remote URL and parses the headers.
+
+ Args:
+ url: The URL to fetch.
+ output_stream: The stream to write the content to.
+
+ Returns:
+            A DownloadResult containing the media length, the URL downloaded,
+            the HTTP response code, the media type, the downloaded file name,
+            the number of milliseconds the result is valid for, and the ETag
+            header.
+ """
+
+ try:
+ logger.debug("Trying to get preview for url '%s'", url)
+ length, headers, uri, code = await self.client.get_file(
+ url,
+ output_stream=output_stream,
+ max_size=self.max_spider_size,
+ headers={
+ b"Accept-Language": self.url_preview_accept_language,
+ # Use a custom user agent for the preview because some sites will only return
+ # Open Graph metadata to crawler user agents. Omit the Synapse version
+ # string to avoid leaking information.
+ b"User-Agent": [
+ "Synapse (bot; +https://github.com/matrix-org/synapse)"
+ ],
+ },
+ is_allowed_content_type=_is_previewable,
+ )
+ except SynapseError:
+ # Pass SynapseErrors through directly, so that the servlet
+ # handler will return a SynapseError to the client instead of
+ # blank data or a 500.
+ raise
+ except DNSLookupError:
+ # DNS lookup returned no results
+ # Note: This will also be the case if one of the resolved IP
+ # addresses is blacklisted
+ raise SynapseError(
+ 502,
+ "DNS resolution failure during URL preview generation",
+ Codes.UNKNOWN,
+ )
+ except Exception as e:
+ # FIXME: pass through 404s and other error messages nicely
+ logger.warning("Error downloading %s: %r", url, e)
+
+ raise SynapseError(
+ 500,
+ "Failed to download content: %s"
+ % (traceback.format_exception_only(sys.exc_info()[0], e),),
+ Codes.UNKNOWN,
+ )
+
+ if b"Content-Type" in headers:
+ media_type = headers[b"Content-Type"][0].decode("ascii")
+ else:
+ media_type = "application/octet-stream"
+
+ download_name = get_filename_from_headers(headers)
+
+ # FIXME: we should calculate a proper expiration based on the
+ # Cache-Control and Expire headers. But for now, assume 1 hour.
+ expires = ONE_HOUR
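+        # A fuller implementation might, for example, honour a max-age
+        # directive from Cache-Control (illustrative sketch only):
+        #   cache_control = headers.get(b"Cache-Control", [b""])[0].decode("ascii")
+        #   max_age = re.search(r"max-age=(\d+)", cache_control)
+        #   if max_age:
+        #       expires = int(max_age.group(1)) * 1000  # max-age is in seconds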
+ etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
+
+ return DownloadResult(
+ length, uri, code, media_type, download_name, expires, etag
+ )
+
+ async def _parse_data_url(
+ self, url: str, output_stream: BinaryIO
+ ) -> DownloadResult:
+ """
+ Parses a data: URL.
+
+ Args:
+ url: The URL to parse.
+ output_stream: The stream to write the content to.
+
+ Returns:
+            A DownloadResult containing the media length, the URL parsed, the
+            (synthetic) HTTP response code, the media type, the downloaded file
+            name, the number of milliseconds the result is valid for, and the
+            ETag header.
+ """
+
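+        # A data: URL carries its content inline rather than referencing a
+        # remote resource, e.g. (illustrative):
+        #   data:image/png;base64,iVBORw0KGgoAAAANSUhEUg...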
+ try:
+ logger.debug("Trying to parse data url '%s'", url)
+ with urlopen(url) as url_info:
+                # TODO: Can this be more efficient?
+ output_stream.write(url_info.read())
+ except Exception as e:
+ logger.warning("Error parsing data: URL %s: %r", url, e)
+
+ raise SynapseError(
+ 500,
+ "Failed to parse data URL: %s"
+ % (traceback.format_exception_only(sys.exc_info()[0], e),),
+ Codes.UNKNOWN,
+ )
+
+ return DownloadResult(
+ # Read back the length that has been written.
+ length=output_stream.tell(),
+ uri=url,
+ # If it was parsed, consider this a 200 OK.
+ response_code=200,
+ # urlopen shoves the media-type from the data URL into the content type
+ # header object.
+ media_type=url_info.headers.get_content_type(),
+ # Some features are not supported by data: URLs.
+ download_name=None,
+ expires=ONE_HOUR,
+ etag=None,
+ )
+
+ async def _handle_url(
+ self, url: str, user: UserID, allow_data_urls: bool = False
+ ) -> MediaInfo:
+ """
+ Fetches content from a URL and parses the result to generate a MediaInfo.
+
+ It uses the media storage provider to persist the fetched content and
+ stores the mapping into the database.
+
+ Args:
+ url: The URL to fetch.
+            user: The user who has requested this URL.
+ allow_data_urls: True if data URLs should be allowed.
+
+ Returns:
+ A MediaInfo object describing the fetched content.
+ """
+
+ # TODO: we should probably honour robots.txt... except in practice
+ # we're most likely being explicitly triggered by a human rather than a
+ # bot, so are we really a robot?
+
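+        # File IDs for URL cache entries combine today's date with a random
+        # suffix, e.g. "2020-01-02_EnMSOvGLsjCJTYVy" (illustrative).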
+ file_id = datetime.date.today().isoformat() + "_" + random_string(16)
+
+ file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
+
+ with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+ if url.startswith("data:"):
+ if not allow_data_urls:
+ raise SynapseError(
+ 500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
+ )
+
+ download_result = await self._parse_data_url(url, f)
+ else:
+ download_result = await self._download_url(url, f)
+
+ await finish()
+
+ try:
+ time_now_ms = self.clock.time_msec()
+
+ await self.store.store_local_media(
+ media_id=file_id,
+ media_type=download_result.media_type,
+ time_now_ms=time_now_ms,
+ upload_name=download_result.download_name,
+ media_length=download_result.length,
+ user_id=user,
+ url_cache=url,
+ )
+
+ except Exception as e:
+ logger.error("Error handling downloaded %s: %r", url, e)
+ # TODO: we really ought to delete the downloaded file in this
+ # case, since we won't have recorded it in the db, and will
+ # therefore not expire it.
+ raise
+
+ return MediaInfo(
+ media_type=download_result.media_type,
+ media_length=download_result.length,
+ download_name=download_result.download_name,
+ created_ts_ms=time_now_ms,
+ filesystem_id=file_id,
+ filename=fname,
+ uri=download_result.uri,
+ response_code=download_result.response_code,
+ expires=download_result.expires,
+ etag=download_result.etag,
+ )
+
+ async def _precache_image_url(
+ self, user: UserID, media_info: MediaInfo, og: JsonDict
+ ) -> None:
+ """
+ Pre-cache the image (if one exists) for posterity
+
+ Args:
+ user: The user requesting the preview.
+ media_info: The media being previewed.
+ og: The Open Graph dictionary. This is modified with image information.
+ """
+ # If there's no image or it is blank, there's nothing to do.
+ if "og:image" not in og:
+ return
+
+ # Remove the raw image URL, this will be replaced with an MXC URL, if successful.
+ image_url = og.pop("og:image")
+ if not image_url:
+ return
+
+        # The image URL from the HTML might be relative to the previewed page;
+        # convert it to a URL which can be requested directly.
+ url_parts = urlparse(image_url)
+ if url_parts.scheme != "data":
+ image_url = urljoin(media_info.uri, image_url)
+
+ # FIXME: it might be cleaner to use the same flow as the main /preview_url
+ # request itself and benefit from the same caching etc. But for now we
+ # just rely on the caching on the master request to speed things up.
+ try:
+ image_info = await self._handle_url(image_url, user, allow_data_urls=True)
+ except Exception as e:
+ # Pre-caching the image failed, don't block the entire URL preview.
+ logger.warning(
+ "Pre-caching image failed during URL preview: %s errored with %s",
+ image_url,
+ e,
+ )
+ return
+
+ if _is_media(image_info.media_type):
+ # TODO: make sure we don't choke on white-on-transparent images
+ file_id = image_info.filesystem_id
+ dims = await self.media_repo._generate_thumbnails(
+ None, file_id, file_id, image_info.media_type, url_cache=True
+ )
+ if dims:
+ og["og:image:width"] = dims["width"]
+ og["og:image:height"] = dims["height"]
+ else:
+ logger.warning("Couldn't get dims for %s", image_url)
+
+ og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
+ og["og:image:type"] = image_info.media_type
+ og["matrix:image:size"] = image_info.media_length
+
+ async def _handle_oembed_response(
+ self, url: str, media_info: MediaInfo, expiration_ms: int
+ ) -> Tuple[JsonDict, Optional[str], int]:
+ """
+ Parse the downloaded oEmbed info.
+
+ Args:
+ url: The URL which is being previewed (not the one which was
+ requested).
+ media_info: The media being previewed.
+ expiration_ms: The length of time, in milliseconds, the media is valid for.
+
+ Returns:
+ A tuple of:
+ The Open Graph dictionary, if the oEmbed info can be parsed.
+ The author name if it could be retrieved from oEmbed.
+ The (possibly updated) length of time, in milliseconds, the media is valid for.
+ """
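+        # An oEmbed response is a small JSON document, e.g. (illustrative):
+        #   {"type": "rich", "title": "Some title", "author_name": "Someone",
+        #    "html": "<iframe src=\"...\"></iframe>", "cache_age": 86400}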
+ # If JSON was not returned, there's nothing to do.
+ if not _is_json(media_info.media_type):
+ return {}, None, expiration_ms
+
+ with open(media_info.filename, "rb") as file:
+ body = file.read()
+
+ oembed_response = self._oembed.parse_oembed_response(url, body)
+ open_graph_result = oembed_response.open_graph_result
+
+ # Use the cache age from the oEmbed result, if one was given.
+ if open_graph_result and oembed_response.cache_age is not None:
+ expiration_ms = oembed_response.cache_age
+
+ return open_graph_result, oembed_response.author_name, expiration_ms
+
+ def _start_expire_url_cache_data(self) -> Deferred:
+ return run_as_background_process(
+ "expire_url_cache_data", self._expire_url_cache_data
+ )
+
+ async def _expire_url_cache_data(self) -> None:
+ """Clean up expired url cache content, media and thumbnails."""
+
+ assert self._worker_run_media_background_jobs
+
+ now = self.clock.time_msec()
+
+ logger.debug("Running url preview cache expiry")
+
+ def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
+ """Attempt to remove the given chain of parent directories
+
+ Args:
+ dirs: The list of directory paths to delete, with children appearing
+ before their parents.
+ """
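+            # e.g. (illustrative) ["<base>/a/b/c", "<base>/a/b", "<base>/a"]:
+            # each child directory is attempted before its parent.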
+ for dir in dirs:
+ try:
+ os.rmdir(dir)
+ except FileNotFoundError:
+ # Already deleted, continue with deleting the rest
+ pass
+ except OSError as e:
+ # Failed, skip deleting the rest of the parent dirs
+ if e.errno != errno.ENOTEMPTY:
+ logger.warning(
+ "Failed to remove media directory while clearing url preview cache: %r: %s",
+ dir,
+ e,
+ )
+ break
+
+ # First we delete expired url cache entries
+ media_ids = await self.store.get_expired_url_cache(now)
+
+ removed_media = []
+ for media_id in media_ids:
+ fname = self.filepaths.url_cache_filepath(media_id)
+ try:
+ os.remove(fname)
+ except FileNotFoundError:
+ pass # If the path doesn't exist, meh
+ except OSError as e:
+ logger.warning(
+ "Failed to remove media while clearing url preview cache: %r: %s",
+ media_id,
+ e,
+ )
+ continue
+
+ removed_media.append(media_id)
+
+ dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
+ try_remove_parent_dirs(dirs)
+
+ await self.store.delete_url_cache(removed_media)
+
+ if removed_media:
+ logger.debug(
+ "Deleted %d entries from url preview cache", len(removed_media)
+ )
+ else:
+ logger.debug("No entries removed from url preview cache")
+
+ # Now we delete old images associated with the url cache.
+        # These may be cached for a bit on the client (i.e. a client may still
+        # have a room open that is showing the preview), so we wait a couple of
+        # days before deleting, just in case.
+ expire_before = now - IMAGE_CACHE_EXPIRY_MS
+ media_ids = await self.store.get_url_cache_media_before(expire_before)
+
+ removed_media = []
+ for media_id in media_ids:
+ fname = self.filepaths.url_cache_filepath(media_id)
+ try:
+ os.remove(fname)
+ except FileNotFoundError:
+ pass # If the path doesn't exist, meh
+ except OSError as e:
+ logger.warning(
+ "Failed to remove media from url preview cache: %r: %s", media_id, e
+ )
+ continue
+
+ dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
+ try_remove_parent_dirs(dirs)
+
+ thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
+ try:
+ shutil.rmtree(thumbnail_dir)
+ except FileNotFoundError:
+ pass # If the path doesn't exist, meh
+ except OSError as e:
+ logger.warning(
+ "Failed to remove media from url preview cache: %r: %s", media_id, e
+ )
+ continue
+
+ removed_media.append(media_id)
+
+ dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
+ # Note that one of the directories to be deleted has already been
+ # removed by the `rmtree` above.
+ try_remove_parent_dirs(dirs)
+
+ await self.store.delete_url_cache_media(removed_media)
+
+ if removed_media:
+ logger.debug("Deleted %d media from url preview cache", len(removed_media))
+ else:
+ logger.debug("No media removed from url preview cache")
+
+
+def _is_media(content_type: str) -> bool:
+ return content_type.lower().startswith("image/")
+
+
+def _is_html(content_type: str) -> bool:
+ content_type = content_type.lower()
+ return content_type.startswith("text/html") or content_type.startswith(
+ "application/xhtml"
+ )
+
+
+def _is_json(content_type: str) -> bool:
+ return content_type.lower().startswith("application/json")
+
+
+def _is_previewable(content_type: str) -> bool:
+ """Returns True for content types for which we will perform URL preview and False
+ otherwise."""
+
+ return _is_html(content_type) or _is_media(content_type) or _is_json(content_type)
|