From 617bf40924bc8d627734eb2fb25a9f3980ede5d5 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Wed, 25 Apr 2018 17:37:29 +0100 Subject: Generate user daily stats --- synapse/storage/__init__.py | 60 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index 8cdfd50f90..6f2f947433 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -14,6 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime +import time +import logging + from synapse.storage.devices import DeviceStore from .appservice import ( ApplicationServiceStore, ApplicationServiceTransactionStore @@ -55,10 +59,6 @@ from .engines import PostgresEngine from synapse.api.constants import PresenceState from synapse.util.caches.stream_change_cache import StreamChangeCache - -import logging - - logger = logging.getLogger(__name__) @@ -347,6 +347,58 @@ class DataStore(RoomMemberStore, RoomStore, return self.runInteraction("count_r30_users", _count_r30_users) + + def generate_user_daily_visits(self): + """ + Generates daily visit data for use in cohort/ retention analysis + """ + def _generate_user_daily_visits(txn): + logger.info("Calling _generate_user_daily_visits") + # determine timestamp of previous days + yesterday = datetime.datetime.now() - datetime.timedelta(days=1) + yesterday_start = datetime.datetime(yesterday.year, + yesterday.month, + yesterday.day, 0, 0, 0, 0) + yesterday_start_time = int(time.mktime(yesterday_start.timetuple())) * 1000 + + # Check that this job has not already been completed + sql = """ + SELECT timestamp + FROM user_daily_visits + ORDER by timestamp desc limit 1 + """ + txn.execute(sql) + row = txn.fetchone() + + # Bail if the most recent time is yesterday + if row and row[0] == yesterday_start_time: + logger.info("Bailing from _generate_user_daily_visits, already completed") + return + logger.info("inserting into user_daily_visits") + # Not specificying an upper bound means that if the update is run at + # 10 mins past midnight and the user is active during a 30 min session + # that the user is still included in the previous days stats + # This does mean that if the update is run hours late, then it is possible + # to overstate the cohort, but this seems a reasonable trade off + # The alternative is to insert on every request - but prefer to avoid + # for performance reasons + sql = """ + SELECT user_id, user_agent, device_id + FROM user_ips + WHERE last_seen > ? + """ + txn.execute(sql, (yesterday_start_time,)) + + sql = """ + INSERT INTO user_daily_visits (user_id, user_agent, device_id, timestamp) + VALUES (?, ?, ?, ?) + """ + + for row in txn: + txn.execute(sql, (row + (yesterday_start_time,))) + + return self.runInteraction("generate_user_daily_visits", _generate_user_daily_visits) + def get_users(self): """Function to reterive a list of users in users table. -- cgit 1.5.1 From fb6015d0a6828d57c18d601ea36b1a8a4cebc0e2 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Wed, 25 Apr 2018 17:56:11 +0100 Subject: pep8 --- synapse/storage/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index 6f2f947433..c22d38800c 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -347,7 +347,6 @@ class DataStore(RoomMemberStore, RoomStore, return self.runInteraction("count_r30_users", _count_r30_users) - def generate_user_daily_visits(self): """ Generates daily visit data for use in cohort/ retention analysis @@ -390,14 +389,16 @@ class DataStore(RoomMemberStore, RoomStore, txn.execute(sql, (yesterday_start_time,)) sql = """ - INSERT INTO user_daily_visits (user_id, user_agent, device_id, timestamp) + INSERT INTO user_daily_visits (user_id. user_agent, + device_id, timestamp) VALUES (?, ?, ?, ?) """ for row in txn: txn.execute(sql, (row + (yesterday_start_time,))) - return self.runInteraction("generate_user_daily_visits", _generate_user_daily_visits) + return self.runInteraction("generate_user_daily_visits", + _generate_user_daily_visits) def get_users(self): """Function to reterive a list of users in users table. -- cgit 1.5.1 From dd1a8324195f37d508d518db10ba37d9028afc86 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Tue, 1 May 2018 12:13:49 +0100 Subject: remove user agent from data model, will just join on user_ips --- synapse/storage/__init__.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index c22d38800c..b51cf70336 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -15,6 +15,7 @@ # limitations under the License. import datetime +from dateutil import tz import time import logging @@ -354,10 +355,9 @@ class DataStore(RoomMemberStore, RoomStore, def _generate_user_daily_visits(txn): logger.info("Calling _generate_user_daily_visits") # determine timestamp of previous days - yesterday = datetime.datetime.now() - datetime.timedelta(days=1) - yesterday_start = datetime.datetime(yesterday.year, - yesterday.month, - yesterday.day, 0, 0, 0, 0) + yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1) + yesterday_start = datetime.datetime(yesterday.year, yesterday.month, + yesterday.day, tzinfo=tz.tzutc()) yesterday_start_time = int(time.mktime(yesterday_start.timetuple())) * 1000 # Check that this job has not already been completed @@ -371,9 +371,8 @@ class DataStore(RoomMemberStore, RoomStore, # Bail if the most recent time is yesterday if row and row[0] == yesterday_start_time: - logger.info("Bailing from _generate_user_daily_visits, already completed") return - logger.info("inserting into user_daily_visits") + # Not specificying an upper bound means that if the update is run at # 10 mins past midnight and the user is active during a 30 min session # that the user is still included in the previous days stats @@ -382,20 +381,20 @@ class DataStore(RoomMemberStore, RoomStore, # The alternative is to insert on every request - but prefer to avoid # for performance reasons sql = """ - SELECT user_id, user_agent, device_id + SELECT user_id, device_id FROM user_ips WHERE last_seen > ? """ txn.execute(sql, (yesterday_start_time,)) + user_visits = txn.fetchall() sql = """ - INSERT INTO user_daily_visits (user_id. user_agent, - device_id, timestamp) - VALUES (?, ?, ?, ?) + INSERT INTO user_daily_visits (user_id, device_id, timestamp) + VALUES (?, ?, ?) """ - for row in txn: - txn.execute(sql, (row + (yesterday_start_time,))) + for visit in user_visits: + txn.execute(sql, (visit + (yesterday_start_time,))) return self.runInteraction("generate_user_daily_visits", _generate_user_daily_visits) -- cgit 1.5.1 From f077e97914c9b5c82c94786130d98af52516cde0 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Mon, 14 May 2018 13:50:58 +0100 Subject: instead of inserting user daily visit data at the end of the day, instead insert incrementally through the day --- synapse/app/homeserver.py | 19 +++++++++++++--- synapse/storage/__init__.py | 54 ++++++++++++--------------------------------- 2 files changed, 30 insertions(+), 43 deletions(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py index f785a7a22b..bfc79a5e81 100755 --- a/synapse/app/homeserver.py +++ b/synapse/app/homeserver.py @@ -17,6 +17,7 @@ import gc import logging import os import sys +import datetime import synapse import synapse.config.logger @@ -475,9 +476,24 @@ def run(hs): " changes across releases." ) + # def recurring_user_daily_visit_stats(): + def generate_user_daily_visit_stats(): hs.get_datastore().generate_user_daily_visits() + # Since user daily stats are bucketed at midnight UTC, + # and user_ips.last_seen can be updated at any time, it is important to call + # generate_user_daily_visit_stats immediately prior to the day end. Assuming + # an hourly cadence, the simplist way is to allign all calls to the hour + # end + end_of_hour = datetime.datetime.now().replace(microsecond=0, second=0, minute=0) \ + + datetime.timedelta(hours=1) \ + - datetime.timedelta(seconds=10) # Ensure method fires before day transistion + + time_to_next_hour = end_of_hour - datetime.datetime.now() + clock.call_later(time_to_next_hour.seconds, + clock.looping_call(generate_user_daily_visit_stats, 60 * 60 * 1000)) + if hs.config.report_stats: logger.info("Scheduling stats reporting for 3 hour intervals") clock.looping_call(phone_stats_home, 3 * 60 * 60 * 1000) @@ -490,9 +506,6 @@ def run(hs): # be quite busy the first few minutes clock.call_later(5 * 60, phone_stats_home) - clock.looping_call(generate_user_daily_visit_stats, 10 * 60 * 1000) - clock.call_later(5 * 60, generate_user_daily_visit_stats) - if hs.config.daemonize and hs.config.print_pidfile: print (hs.config.pid_file) diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index b51cf70336..6949876c13 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -353,48 +353,22 @@ class DataStore(RoomMemberStore, RoomStore, Generates daily visit data for use in cohort/ retention analysis """ def _generate_user_daily_visits(txn): - logger.info("Calling _generate_user_daily_visits") - # determine timestamp of previous days - yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1) - yesterday_start = datetime.datetime(yesterday.year, yesterday.month, - yesterday.day, tzinfo=tz.tzutc()) - yesterday_start_time = int(time.mktime(yesterday_start.timetuple())) * 1000 - - # Check that this job has not already been completed - sql = """ - SELECT timestamp - FROM user_daily_visits - ORDER by timestamp desc limit 1 - """ - txn.execute(sql) - row = txn.fetchone() - - # Bail if the most recent time is yesterday - if row and row[0] == yesterday_start_time: - return - - # Not specificying an upper bound means that if the update is run at - # 10 mins past midnight and the user is active during a 30 min session - # that the user is still included in the previous days stats - # This does mean that if the update is run hours late, then it is possible - # to overstate the cohort, but this seems a reasonable trade off - # The alternative is to insert on every request - but prefer to avoid - # for performance reasons - sql = """ - SELECT user_id, device_id - FROM user_ips - WHERE last_seen > ? - """ - txn.execute(sql, (yesterday_start_time,)) - user_visits = txn.fetchall() + # determine timestamp of the day start + now = datetime.datetime.utcnow() + today_start = datetime.datetime(now.year, now.month, + now.day, tzinfo=tz.tzutc()) + today_start_time = int(time.mktime(today_start.timetuple())) * 1000 + logger.info(today_start_time) sql = """ - INSERT INTO user_daily_visits (user_id, device_id, timestamp) - VALUES (?, ?, ?) - """ - - for visit in user_visits: - txn.execute(sql, (visit + (yesterday_start_time,))) + INSERT INTO user_daily_visits (user_id, device_id, timestamp) + SELECT user_id, device_id, ? + FROM user_ips AS u + LEFT JOIN user_daily_visits USING (user_id, device_id) + WHERE last_seen > ? AND timestamp IS NULL + GROUP BY user_id, device_id; + """ + txn.execute(sql, (today_start_time, today_start_time)) return self.runInteraction("generate_user_daily_visits", _generate_user_daily_visits) -- cgit 1.5.1 From 05ac15ae824cc538b869e3cc8db7af2ac22e6754 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Tue, 15 May 2018 17:01:33 +0100 Subject: Limit query load of generate_user_daily_visits The aim is to keep track of when it was last called and only query from that point in time --- synapse/app/homeserver.py | 21 ++++++---------- synapse/storage/__init__.py | 60 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 54 insertions(+), 27 deletions(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py index bfc79a5e81..f25eaf9ffc 100755 --- a/synapse/app/homeserver.py +++ b/synapse/app/homeserver.py @@ -476,23 +476,16 @@ def run(hs): " changes across releases." ) - # def recurring_user_daily_visit_stats(): - def generate_user_daily_visit_stats(): hs.get_datastore().generate_user_daily_visits() - # Since user daily stats are bucketed at midnight UTC, - # and user_ips.last_seen can be updated at any time, it is important to call - # generate_user_daily_visit_stats immediately prior to the day end. Assuming - # an hourly cadence, the simplist way is to allign all calls to the hour - # end - end_of_hour = datetime.datetime.now().replace(microsecond=0, second=0, minute=0) \ - + datetime.timedelta(hours=1) \ - - datetime.timedelta(seconds=10) # Ensure method fires before day transistion - - time_to_next_hour = end_of_hour - datetime.datetime.now() - clock.call_later(time_to_next_hour.seconds, - clock.looping_call(generate_user_daily_visit_stats, 60 * 60 * 1000)) + def recurring_user_daily_visit_stats(): + clock.looping_call(generate_user_daily_visit_stats, 60 * 60 * 1000) + + # Rather than update on per session basis, batch up the requests. + # If you increase the loop period, the accuracy of user_daily_visits + # table will decrease + clock.looping_call(generate_user_daily_visit_stats, 5 * 60 * 1000) if hs.config.report_stats: logger.info("Scheduling stats reporting for 3 hour intervals") diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index 6949876c13..52f176a03c 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -214,6 +214,9 @@ class DataStore(RoomMemberStore, RoomStore, self._stream_order_on_start = self.get_room_max_stream_ordering() self._min_stream_order_on_start = self.get_room_min_stream_ordering() + # Used in _generate_user_daily_visits to keep track of progress + self._last_user_visit_update = self._get_start_of_day() + super(DataStore, self).__init__(db_conn, hs) def take_presence_startup_info(self): @@ -348,27 +351,58 @@ class DataStore(RoomMemberStore, RoomStore, return self.runInteraction("count_r30_users", _count_r30_users) + def _get_start_of_day(self): + """ + Returns millisecond unixtime for start of UTC day. + """ + now = datetime.datetime.utcnow() + today_start = datetime.datetime(now.year, now.month, + now.day, tzinfo=tz.tzutc()) + return int(time.mktime(today_start.timetuple())) * 1000 + def generate_user_daily_visits(self): """ Generates daily visit data for use in cohort/ retention analysis """ def _generate_user_daily_visits(txn): + logger.info("Calling _generate_user_daily_visits") + today_start = self._get_start_of_day() + a_day_in_milliseconds = 24 * 60 * 60 * 1000 - # determine timestamp of the day start - now = datetime.datetime.utcnow() - today_start = datetime.datetime(now.year, now.month, - now.day, tzinfo=tz.tzutc()) - today_start_time = int(time.mktime(today_start.timetuple())) * 1000 - logger.info(today_start_time) sql = """ INSERT INTO user_daily_visits (user_id, device_id, timestamp) - SELECT user_id, device_id, ? - FROM user_ips AS u - LEFT JOIN user_daily_visits USING (user_id, device_id) - WHERE last_seen > ? AND timestamp IS NULL - GROUP BY user_id, device_id; - """ - txn.execute(sql, (today_start_time, today_start_time)) + SELECT u.user_id, u.device_id, ? + FROM user_ips AS u + LEFT JOIN ( + SELECT user_id, device_id, timestamp FROM user_daily_visits + WHERE timestamp IS ? + ) udv + ON u.user_id = udv.user_id AND u.device_id=udv.device_id + WHERE last_seen > ? AND last_seen <= ? AND udv.timestamp IS NULL + """ + + # This means that the day has rolled over but there could still + # be entries from the previous day. There is an edge case + # where if the user logs in at 23:59 and overwrites their + # last_seen at 00:01 then they will not be counted in the + # previous day's stats - it is important that the query is run + # to minimise this case. + if today_start > self._last_user_visit_update: + yesterday_start = today_start - a_day_in_milliseconds + txn.execute(sql, (yesterday_start, yesterday_start, + self._last_user_visit_update, today_start)) + self._last_user_visit_update = today_start + + txn.execute(sql, (today_start, today_start, + self._last_user_visit_update, + today_start + a_day_in_milliseconds)) + # Update _last_user_visit_update to now. The reason to do this + # rather just clamping to the beginning of the day is to limit + # the size of the join - meaning that the query can be run more + # frequently + + now = datetime.datetime.utcnow() + self._last_user_visit_update = int(time.mktime(now.timetuple())) * 1000 return self.runInteraction("generate_user_daily_visits", _generate_user_daily_visits) -- cgit 1.5.1 From 31c2502ca8d74797f1eae553690830b8c132aed9 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Wed, 16 May 2018 09:46:43 +0100 Subject: style and further contraining query --- synapse/storage/__init__.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index 52f176a03c..6f324e075f 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -368,6 +368,7 @@ class DataStore(RoomMemberStore, RoomStore, logger.info("Calling _generate_user_daily_visits") today_start = self._get_start_of_day() a_day_in_milliseconds = 24 * 60 * 60 * 1000 + now = self.clock.time_msec() sql = """ INSERT INTO user_daily_visits (user_id, device_id, timestamp) @@ -386,23 +387,26 @@ class DataStore(RoomMemberStore, RoomStore, # where if the user logs in at 23:59 and overwrites their # last_seen at 00:01 then they will not be counted in the # previous day's stats - it is important that the query is run - # to minimise this case. + # often to minimise this case. if today_start > self._last_user_visit_update: yesterday_start = today_start - a_day_in_milliseconds - txn.execute(sql, (yesterday_start, yesterday_start, - self._last_user_visit_update, today_start)) + txn.execute(sql, ( + yesterday_start, yesterday_start, + self._last_user_visit_update, today_start + )) self._last_user_visit_update = today_start - txn.execute(sql, (today_start, today_start, - self._last_user_visit_update, - today_start + a_day_in_milliseconds)) + txn.execute(sql, ( + today_start, today_start, + self._last_user_visit_update, + now + )) # Update _last_user_visit_update to now. The reason to do this # rather just clamping to the beginning of the day is to limit # the size of the join - meaning that the query can be run more # frequently - now = datetime.datetime.utcnow() - self._last_user_visit_update = int(time.mktime(now.timetuple())) * 1000 + self._last_user_visit_update = now return self.runInteraction("generate_user_daily_visits", _generate_user_daily_visits) -- cgit 1.5.1 From be11a02c4f854452687f0f54492bc53b4827637d Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Wed, 16 May 2018 10:45:40 +0100 Subject: remove empty line --- synapse/storage/__init__.py | 1 - 1 file changed, 1 deletion(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index 6f324e075f..4551cf8774 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -405,7 +405,6 @@ class DataStore(RoomMemberStore, RoomStore, # rather just clamping to the beginning of the day is to limit # the size of the join - meaning that the query can be run more # frequently - self._last_user_visit_update = now return self.runInteraction("generate_user_daily_visits", -- cgit 1.5.1 From ef466b3a131c0a6c93027a8f8194538678f920c1 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Fri, 18 May 2018 15:51:21 +0100 Subject: fix psql compatability bug --- synapse/storage/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index 4551cf8774..ac264b5d25 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -376,7 +376,7 @@ class DataStore(RoomMemberStore, RoomStore, FROM user_ips AS u LEFT JOIN ( SELECT user_id, device_id, timestamp FROM user_daily_visits - WHERE timestamp IS ? + WHERE timestamp = ? ) udv ON u.user_id = udv.user_id AND u.device_id=udv.device_id WHERE last_seen > ? AND last_seen <= ? AND udv.timestamp IS NULL -- cgit 1.5.1 From 644aac5f73dbacfe4cb47af09e804b9d1d2788a4 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Fri, 18 May 2018 17:10:35 +0100 Subject: Tighter filtering for user_daily_visits --- synapse/storage/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'synapse/storage/__init__.py') diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index ac264b5d25..979fa22438 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -379,7 +379,11 @@ class DataStore(RoomMemberStore, RoomStore, WHERE timestamp = ? ) udv ON u.user_id = udv.user_id AND u.device_id=udv.device_id - WHERE last_seen > ? AND last_seen <= ? AND udv.timestamp IS NULL + INNER JOIN users ON users.name=u.user_id + WHERE last_seen > ? AND last_seen <= ? + AND udv.timestamp IS NULL AND users.is_guest=0 + AND users.appservice_id IS NULL + GROUP BY u.user_id, u.device_id """ # This means that the day has rolled over but there could still -- cgit 1.5.1