summary refs log tree commit diff
path: root/synapse/storage/__init__.py
diff options
context:
space:
mode:
authorNeil Johnson <neil@fragile.org.uk>2018-05-14 13:50:58 +0100
committerNeil Johnson <neil@fragile.org.uk>2018-05-14 13:50:58 +0100
commitf077e97914c9b5c82c94786130d98af52516cde0 (patch)
tree67881c17462bcc13a06e92c23608c9b15dfdc9d1 /synapse/storage/__init__.py
parentMerge branch 'develop' of https://github.com/matrix-org/synapse into cohort_a... (diff)
downloadsynapse-f077e97914c9b5c82c94786130d98af52516cde0.tar.xz
instead of inserting user daily visit data at the end of the day, instead insert incrementally through the day
Diffstat (limited to 'synapse/storage/__init__.py')
-rw-r--r--synapse/storage/__init__.py54
1 files changed, 14 insertions, 40 deletions
diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py
index b51cf70336..6949876c13 100644
--- a/synapse/storage/__init__.py
+++ b/synapse/storage/__init__.py
@@ -353,48 +353,22 @@ class DataStore(RoomMemberStore, RoomStore,
         Generates daily visit data for use in cohort/ retention analysis
         """
         def _generate_user_daily_visits(txn):
-            logger.info("Calling _generate_user_daily_visits")
-            # determine timestamp of previous days
-            yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
-            yesterday_start = datetime.datetime(yesterday.year, yesterday.month,
-                                                yesterday.day, tzinfo=tz.tzutc())
-            yesterday_start_time = int(time.mktime(yesterday_start.timetuple())) * 1000
-
-            # Check that this job has not already been completed
-            sql = """
-                SELECT timestamp
-                FROM user_daily_visits
-                ORDER by timestamp desc limit 1
-            """
-            txn.execute(sql)
-            row = txn.fetchone()
-
-            # Bail if the most recent time is yesterday
-            if row and row[0] == yesterday_start_time:
-                return
-
-            # Not specificying an upper bound means that if the update is run at
-            # 10 mins past midnight and the user is active during a 30 min session
-            # that the user is still included in the previous days stats
-            # This does mean that if the update is run hours late, then it is possible
-            # to overstate the cohort, but this seems a reasonable trade off
-            # The alternative is to insert on every request - but prefer to avoid
-            # for performance reasons
-            sql = """
-                    SELECT user_id, device_id
-                    FROM user_ips
-                    WHERE last_seen > ?
-            """
-            txn.execute(sql, (yesterday_start_time,))
-            user_visits = txn.fetchall()
 
+            # determine timestamp of the day start
+            now = datetime.datetime.utcnow()
+            today_start = datetime.datetime(now.year, now.month,
+                                            now.day, tzinfo=tz.tzutc())
+            today_start_time = int(time.mktime(today_start.timetuple())) * 1000
+            logger.info(today_start_time)
             sql = """
-                    INSERT INTO user_daily_visits (user_id, device_id, timestamp)
-                    VALUES (?, ?, ?)
-            """
-
-            for visit in user_visits:
-                txn.execute(sql, (visit + (yesterday_start_time,)))
+                INSERT INTO user_daily_visits (user_id, device_id, timestamp)
+                SELECT user_id, device_id, ?
+                FROM user_ips AS u
+                LEFT JOIN user_daily_visits USING (user_id, device_id)
+                WHERE last_seen > ? AND timestamp IS NULL
+                GROUP BY user_id, device_id;
+                """
+            txn.execute(sql, (today_start_time, today_start_time))
 
         return self.runInteraction("generate_user_daily_visits",
                                    _generate_user_daily_visits)