summary refs log tree commit diff
path: root/synapse
diff options
context:
space:
mode:
authorNeil Johnson <neil@fragile.org.uk>2018-05-14 13:50:58 +0100
committerNeil Johnson <neil@fragile.org.uk>2018-05-14 13:50:58 +0100
commitf077e97914c9b5c82c94786130d98af52516cde0 (patch)
tree67881c17462bcc13a06e92c23608c9b15dfdc9d1 /synapse
parentMerge branch 'develop' of https://github.com/matrix-org/synapse into cohort_a... (diff)
downloadsynapse-f077e97914c9b5c82c94786130d98af52516cde0.tar.xz
instead of inserting user daily visit data at the end of the day, instead insert incrementally through the day
Diffstat (limited to 'synapse')
-rwxr-xr-xsynapse/app/homeserver.py19
-rw-r--r--synapse/storage/__init__.py54
2 files changed, 30 insertions, 43 deletions
diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py
index f785a7a22b..bfc79a5e81 100755
--- a/synapse/app/homeserver.py
+++ b/synapse/app/homeserver.py
@@ -17,6 +17,7 @@ import gc
 import logging
 import os
 import sys
+import datetime
 
 import synapse
 import synapse.config.logger
@@ -475,9 +476,24 @@ def run(hs):
                 " changes across releases."
             )
 
+    # def recurring_user_daily_visit_stats():
+
     def generate_user_daily_visit_stats():
         hs.get_datastore().generate_user_daily_visits()
 
+    # Since user daily stats are bucketed at midnight UTC,
+    # and user_ips.last_seen can be updated at any time, it is important to call
+    # generate_user_daily_visit_stats immediately prior to the day end. Assuming
+    # an hourly cadence, the simplist way is to allign all calls to the hour
+    # end
+    end_of_hour = datetime.datetime.now().replace(microsecond=0, second=0, minute=0) \
+        + datetime.timedelta(hours=1) \
+        - datetime.timedelta(seconds=10)  # Ensure method fires before day transistion
+
+    time_to_next_hour = end_of_hour - datetime.datetime.now()
+    clock.call_later(time_to_next_hour.seconds,
+                     clock.looping_call(generate_user_daily_visit_stats, 60 * 60 * 1000))
+
     if hs.config.report_stats:
         logger.info("Scheduling stats reporting for 3 hour intervals")
         clock.looping_call(phone_stats_home, 3 * 60 * 60 * 1000)
@@ -490,9 +506,6 @@ def run(hs):
         # be quite busy the first few minutes
         clock.call_later(5 * 60, phone_stats_home)
 
-    clock.looping_call(generate_user_daily_visit_stats, 10 * 60 * 1000)
-    clock.call_later(5 * 60, generate_user_daily_visit_stats)
-
     if hs.config.daemonize and hs.config.print_pidfile:
         print (hs.config.pid_file)
 
diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py
index b51cf70336..6949876c13 100644
--- a/synapse/storage/__init__.py
+++ b/synapse/storage/__init__.py
@@ -353,48 +353,22 @@ class DataStore(RoomMemberStore, RoomStore,
         Generates daily visit data for use in cohort/ retention analysis
         """
         def _generate_user_daily_visits(txn):
-            logger.info("Calling _generate_user_daily_visits")
-            # determine timestamp of previous days
-            yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
-            yesterday_start = datetime.datetime(yesterday.year, yesterday.month,
-                                                yesterday.day, tzinfo=tz.tzutc())
-            yesterday_start_time = int(time.mktime(yesterday_start.timetuple())) * 1000
-
-            # Check that this job has not already been completed
-            sql = """
-                SELECT timestamp
-                FROM user_daily_visits
-                ORDER by timestamp desc limit 1
-            """
-            txn.execute(sql)
-            row = txn.fetchone()
-
-            # Bail if the most recent time is yesterday
-            if row and row[0] == yesterday_start_time:
-                return
-
-            # Not specificying an upper bound means that if the update is run at
-            # 10 mins past midnight and the user is active during a 30 min session
-            # that the user is still included in the previous days stats
-            # This does mean that if the update is run hours late, then it is possible
-            # to overstate the cohort, but this seems a reasonable trade off
-            # The alternative is to insert on every request - but prefer to avoid
-            # for performance reasons
-            sql = """
-                    SELECT user_id, device_id
-                    FROM user_ips
-                    WHERE last_seen > ?
-            """
-            txn.execute(sql, (yesterday_start_time,))
-            user_visits = txn.fetchall()
 
+            # determine timestamp of the day start
+            now = datetime.datetime.utcnow()
+            today_start = datetime.datetime(now.year, now.month,
+                                            now.day, tzinfo=tz.tzutc())
+            today_start_time = int(time.mktime(today_start.timetuple())) * 1000
+            logger.info(today_start_time)
             sql = """
-                    INSERT INTO user_daily_visits (user_id, device_id, timestamp)
-                    VALUES (?, ?, ?)
-            """
-
-            for visit in user_visits:
-                txn.execute(sql, (visit + (yesterday_start_time,)))
+                INSERT INTO user_daily_visits (user_id, device_id, timestamp)
+                SELECT user_id, device_id, ?
+                FROM user_ips AS u
+                LEFT JOIN user_daily_visits USING (user_id, device_id)
+                WHERE last_seen > ? AND timestamp IS NULL
+                GROUP BY user_id, device_id;
+                """
+            txn.execute(sql, (today_start_time, today_start_time))
 
         return self.runInteraction("generate_user_daily_visits",
                                    _generate_user_daily_visits)