diff options
author | Neil Johnson <neil@fragile.org.uk> | 2018-05-14 13:50:58 +0100 |
---|---|---|
committer | Neil Johnson <neil@fragile.org.uk> | 2018-05-14 13:50:58 +0100 |
commit | f077e97914c9b5c82c94786130d98af52516cde0 (patch) | |
tree | 67881c17462bcc13a06e92c23608c9b15dfdc9d1 /synapse/storage/__init__.py | |
parent | Merge branch 'develop' of https://github.com/matrix-org/synapse into cohort_a... (diff) | |
download | synapse-f077e97914c9b5c82c94786130d98af52516cde0.tar.xz |
Insert user daily visit data incrementally through the day, instead of inserting it all at the end of the day
Diffstat (limited to 'synapse/storage/__init__.py')
-rw-r--r-- | synapse/storage/__init__.py | 54 |
1 files changed, 14 insertions, 40 deletions
diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index b51cf70336..6949876c13 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -353,48 +353,22 @@ class DataStore(RoomMemberStore, RoomStore, Generates daily visit data for use in cohort/ retention analysis """ def _generate_user_daily_visits(txn): - logger.info("Calling _generate_user_daily_visits") - # determine timestamp of previous days - yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1) - yesterday_start = datetime.datetime(yesterday.year, yesterday.month, - yesterday.day, tzinfo=tz.tzutc()) - yesterday_start_time = int(time.mktime(yesterday_start.timetuple())) * 1000 - - # Check that this job has not already been completed - sql = """ - SELECT timestamp - FROM user_daily_visits - ORDER by timestamp desc limit 1 - """ - txn.execute(sql) - row = txn.fetchone() - - # Bail if the most recent time is yesterday - if row and row[0] == yesterday_start_time: - return - - # Not specificying an upper bound means that if the update is run at - # 10 mins past midnight and the user is active during a 30 min session - # that the user is still included in the previous days stats - # This does mean that if the update is run hours late, then it is possible - # to overstate the cohort, but this seems a reasonable trade off - # The alternative is to insert on every request - but prefer to avoid - # for performance reasons - sql = """ - SELECT user_id, device_id - FROM user_ips - WHERE last_seen > ? - """ - txn.execute(sql, (yesterday_start_time,)) - user_visits = txn.fetchall() + # determine timestamp of the day start + now = datetime.datetime.utcnow() + today_start = datetime.datetime(now.year, now.month, + now.day, tzinfo=tz.tzutc()) + today_start_time = int(time.mktime(today_start.timetuple())) * 1000 + logger.info(today_start_time) sql = """ - INSERT INTO user_daily_visits (user_id, device_id, timestamp) - VALUES (?, ?, ?) 
- """ - - for visit in user_visits: - txn.execute(sql, (visit + (yesterday_start_time,))) + INSERT INTO user_daily_visits (user_id, device_id, timestamp) + SELECT user_id, device_id, ? + FROM user_ips AS u + LEFT JOIN user_daily_visits USING (user_id, device_id) + WHERE last_seen > ? AND timestamp IS NULL + GROUP BY user_id, device_id; + """ + txn.execute(sql, (today_start_time, today_start_time)) return self.runInteraction("generate_user_daily_visits", _generate_user_daily_visits) |