summary refs log tree commit diff
path: root/synapse
diff options
context:
space:
mode:
author Neil Johnson <neil@fragile.org.uk> 2018-05-15 17:01:33 +0100
committer Neil Johnson <neil@fragile.org.uk> 2018-05-15 17:01:33 +0100
commit 05ac15ae824cc538b869e3cc8db7af2ac22e6754 (patch)
tree e0df99cd94840ce5e811d812919d030c7c1944b3 /synapse
parent instead of inserting user daily visit data at the end of the day, instead ins... (diff)
download synapse-05ac15ae824cc538b869e3cc8db7af2ac22e6754.tar.xz
Limit query load of generate_user_daily_visits
The aim is to keep track of when it was last called and to query only from that point in time onwards.
Diffstat (limited to 'synapse')
-rwxr-xr-x synapse/app/homeserver.py 21
-rw-r--r-- synapse/storage/__init__.py 60
2 files changed, 54 insertions, 27 deletions
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index bfc79a5e81..f25eaf9ffc 100755
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -476,23 +476,16 @@ def run(hs):
                 " changes across releases."
             )
 
-    # def recurring_user_daily_visit_stats():
-
     def generate_user_daily_visit_stats():
         hs.get_datastore().generate_user_daily_visits()
 
-    # Since user daily stats are bucketed at midnight UTC,
-    # and user_ips.last_seen can be updated at any time, it is important to call
-    # generate_user_daily_visit_stats immediately prior to the day end. Assuming
-    # an hourly cadence, the simplist way is to allign all calls to the hour
-    # end
-    end_of_hour = datetime.datetime.now().replace(microsecond=0, second=0, minute=0) \
-        + datetime.timedelta(hours=1) \
-        - datetime.timedelta(seconds=10)  # Ensure method fires before day transistion
-
-    time_to_next_hour = end_of_hour - datetime.datetime.now()
-    clock.call_later(time_to_next_hour.seconds,
-                     clock.looping_call(generate_user_daily_visit_stats, 60 * 60 * 1000))
+    def recurring_user_daily_visit_stats():
+        clock.looping_call(generate_user_daily_visit_stats, 60 * 60 * 1000)
+
+    # Rather than update on per session basis, batch up the requests.
+    # If you increase the loop period, the accuracy of user_daily_visits
+    # table will decrease
+    clock.looping_call(generate_user_daily_visit_stats, 5 * 60 * 1000)
 
     if hs.config.report_stats:
         logger.info("Scheduling stats reporting for 3 hour intervals")
diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py
index 6949876c13..52f176a03c 100644
--- a/synapse/storage/__init__.py
+++ b/synapse/storage/__init__.py
@@ -214,6 +214,9 @@ class DataStore(RoomMemberStore, RoomStore,
         self._stream_order_on_start = self.get_room_max_stream_ordering()
         self._min_stream_order_on_start = self.get_room_min_stream_ordering()
 
+        # Used in _generate_user_daily_visits to keep track of progress
+        self._last_user_visit_update = self._get_start_of_day()
+
         super(DataStore, self).__init__(db_conn, hs)
 
     def take_presence_startup_info(self):
@@ -348,27 +351,58 @@ class DataStore(RoomMemberStore, RoomStore,
 
         return self.runInteraction("count_r30_users", _count_r30_users)
 
+    def _get_start_of_day(self):
+        """
+        Returns millisecond unixtime for start of UTC day.
+        """
+        now = datetime.datetime.utcnow()
+        today_start = datetime.datetime(now.year, now.month,
+                                        now.day, tzinfo=tz.tzutc())
+        return int(time.mktime(today_start.timetuple())) * 1000
+
     def generate_user_daily_visits(self):
         """
         Generates daily visit data for use in cohort/ retention analysis
         """
         def _generate_user_daily_visits(txn):
+            logger.info("Calling _generate_user_daily_visits")
+            today_start = self._get_start_of_day()
+            a_day_in_milliseconds = 24 * 60 * 60 * 1000
 
-            # determine timestamp of the day start
-            now = datetime.datetime.utcnow()
-            today_start = datetime.datetime(now.year, now.month,
-                                            now.day, tzinfo=tz.tzutc())
-            today_start_time = int(time.mktime(today_start.timetuple())) * 1000
-            logger.info(today_start_time)
             sql = """
                 INSERT INTO user_daily_visits (user_id, device_id, timestamp)
-                SELECT user_id, device_id, ?
-                FROM user_ips AS u
-                LEFT JOIN user_daily_visits USING (user_id, device_id)
-                WHERE last_seen > ? AND timestamp IS NULL
-                GROUP BY user_id, device_id;
-                """
-            txn.execute(sql, (today_start_time, today_start_time))
+                    SELECT u.user_id, u.device_id, ?
+                    FROM user_ips AS u
+                    LEFT JOIN (
+                      SELECT user_id, device_id, timestamp FROM user_daily_visits
+                      WHERE timestamp IS ?
+                    ) udv
+                    ON u.user_id = udv.user_id AND u.device_id=udv.device_id
+                    WHERE last_seen > ? AND last_seen <= ? AND udv.timestamp IS NULL
+            """
+
+            # This means that the day has rolled over but there could still
+            # be entries from the previous day. There is an edge case
+            # where if the user logs in at 23:59 and overwrites their
+            # last_seen at 00:01 then they will not be counted in the
+            # previous day's stats - it is important that the query is run
+            # to minimise this case.
+            if today_start > self._last_user_visit_update:
+                yesterday_start = today_start - a_day_in_milliseconds
+                txn.execute(sql, (yesterday_start, yesterday_start,
+                                  self._last_user_visit_update, today_start))
+                self._last_user_visit_update = today_start
+
+            txn.execute(sql, (today_start, today_start,
+                              self._last_user_visit_update,
+                              today_start + a_day_in_milliseconds))
+            # Update _last_user_visit_update to now. The reason to do this
+            # rather just clamping to the beginning of the day is to limit
+            # the size of the join - meaning that the query can be run more
+            # frequently
+
+            now = datetime.datetime.utcnow()
+            self._last_user_visit_update = int(time.mktime(now.timetuple())) * 1000
 
         return self.runInteraction("generate_user_daily_visits",
                                    _generate_user_daily_visits)