summary refs log blame commit diff
path: root/scripts/synapse_port_db
blob: d4772fcf6ee23535f124de2c8debf60af52a2497 (plain) (tree)
1
                     




















                                                                          
             
              

                

           
                                             
 






                                       


























                                        
                   

 

                          
                    



                                                                           
                                        
                                     




                                                                                  



                                                                                          
 







                                                          
                                                                                    
                                           
                                                                          





                                                                                
                                                           


                                                
                                                                  
 




                                                    
                                                         




                                                   






                                       
 
 

                                    
 
                          
                                 
                                       





                                                                             
 
                                
                                  







                                                                 
 
                                      
 


                                                                                    
             





                                                                          
                                                         
                                                     

                                                        
 
                          

                                                                                
 
                                                                             

                                                                         
                          
                  
                                                                 
 


                                                                             
 



                                                                   
 
                                    
 
                                                                               
 
                    
                                            
                                           
                                                                              




                                                                                               
 






















                                                                   
 









                                                                   
 
                                                         
 
                                          
 








                                                                  
         
                                                 
 
                        
 
                          
            

                                                   
             
 









                                                                          


                                              
                                       
                                                        
                                                       
                                                           

                                                           
                                                      
















                                                                              
                                                      
















                                                                 
                                                 

                                                  
                 
                                           





                                                                               







                                           





                                                
 












                                                                   
                            































                                                                      
                                                        
 


                                                               
 

                                                     



































                                                                           
                                                                   































                                                                     
 












































                                                          

                                



                                                           




                                                                    



















                                                               



                                                                         


















































                                                                                       
                          








































                                                             
                          


                                                                             
                                                  
                        









                                                                         
     



                                                                      
 
                              
 







                                                                                     









                                             
                                                          
 

                                                     





                                                                   



















                                             
 

                                                                     
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2015 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from twisted.internet import defer, reactor
from twisted.enterprise import adbapi

from synapse.storage._base import LoggingTransaction, SQLBaseStore
from synapse.storage.engines import create_engine

import argparse
import curses
import logging
import sys
import time
import traceback
import yaml


logger = logging.getLogger("synapse_port_db")


BOOLEAN_COLUMNS = {
    "events": ["processed", "outlier"],
    "rooms": ["is_public"],
    "event_edges": ["is_state"],
    "presence_list": ["accepted"],
}


APPEND_ONLY_TABLES = [
    "event_content_hashes",
    "event_reference_hashes",
    "event_signatures",
    "event_edge_hashes",
    "events",
    "event_json",
    "state_events",
    "room_memberships",
    "feedback",
    "topics",
    "room_names",
    "rooms",
    "local_media_repository",
    "local_media_repository_thumbnails",
    "remote_media_cache",
    "remote_media_cache_thumbnails",
    "redactions",
    "event_edges",
    "event_auth",
    "received_transactions",
    "sent_transactions",
    "transaction_id_to_pdu",
    "users",
    "state_groups",
    "state_groups_state",
    "event_to_state_groups",
    "rejections",
    "event_search",
]


end_error_exec_info = None


class Store(object):
    """This object is used to pull out some of the convenience API from the
    Storage layer.

    *All* database interactions should go through this object.
    """
    def __init__(self, db_pool, engine):
        self.db_pool = db_pool
        self.database_engine = engine

    _simple_insert_txn = SQLBaseStore.__dict__["_simple_insert_txn"]
    _simple_insert = SQLBaseStore.__dict__["_simple_insert"]

    _simple_select_onecol_txn = SQLBaseStore.__dict__["_simple_select_onecol_txn"]
    _simple_select_onecol = SQLBaseStore.__dict__["_simple_select_onecol"]
    _simple_select_one_onecol = SQLBaseStore.__dict__["_simple_select_one_onecol"]
    _simple_select_one_onecol_txn = SQLBaseStore.__dict__["_simple_select_one_onecol_txn"]

    _simple_update_one = SQLBaseStore.__dict__["_simple_update_one"]
    _simple_update_one_txn = SQLBaseStore.__dict__["_simple_update_one_txn"]

    def runInteraction(self, desc, func, *args, **kwargs):
        def r(conn):
            try:
                i = 0
                N = 5
                while True:
                    try:
                        txn = conn.cursor()
                        return func(
                            LoggingTransaction(txn, desc, self.database_engine, []),
                            *args, **kwargs
                        )
                    except self.database_engine.module.DatabaseError as e:
                        if self.database_engine.is_deadlock(e):
                            logger.warn("[TXN DEADLOCK] {%s} %d/%d", desc, i, N)
                            if i < N:
                                i += 1
                                conn.rollback()
                                continue
                        raise
            except Exception as e:
                logger.debug("[TXN FAIL] {%s} %s", desc, e)
                raise

        return self.db_pool.runWithConnection(r)

    def execute(self, f, *args, **kwargs):
        return self.runInteraction(f.__name__, f, *args, **kwargs)

    def execute_sql(self, sql, *args):
        def r(txn):
            txn.execute(sql, args)
            return txn.fetchall()
        return self.runInteraction("execute_sql", r)

    def insert_many_txn(self, txn, table, headers, rows):
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (
            table,
            ", ".join(k for k in headers),
            ", ".join("%s" for _ in headers)
        )

        try:
            txn.executemany(sql, rows)
        except:
            logger.exception(
                "Failed to insert: %s",
                table,
            )
            raise


class Porter(object):
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    @defer.inlineCallbacks
    def setup_table(self, table):
        if table in APPEND_ONLY_TABLES:
            # It's safe to just carry on inserting.
            next_chunk = yield self.postgres_store._simple_select_one_onecol(
                table="port_from_sqlite3",
                keyvalues={"table_name": table},
                retcol="rowid",
                allow_none=True,
            )

            total_to_port = None
            if next_chunk is None:
                if table == "sent_transactions":
                    next_chunk, already_ported, total_to_port = (
                        yield self._setup_sent_transactions()
                    )
                else:
                    yield self.postgres_store._simple_insert(
                        table="port_from_sqlite3",
                        values={"table_name": table, "rowid": 1}
                    )

                    next_chunk = 1
                    already_ported = 0

            if total_to_port is None:
                already_ported, total_to_port = yield self._get_total_count_to_port(
                    table, next_chunk
                )
        else:
            def delete_all(txn):
                txn.execute(
                    "DELETE FROM port_from_sqlite3 WHERE table_name = %s",
                    (table,)
                )
                txn.execute("TRUNCATE %s CASCADE" % (table,))

            yield self.postgres_store.execute(delete_all)

            yield self.postgres_store._simple_insert(
                table="port_from_sqlite3",
                values={"table_name": table, "rowid": 0}
            )

            next_chunk = 1

            already_ported, total_to_port = yield self._get_total_count_to_port(
                table, next_chunk
            )

        defer.returnValue((table, already_ported, total_to_port, next_chunk))

    @defer.inlineCallbacks
    def handle_table(self, table, postgres_size, table_size, next_chunk):
        if not table_size:
            return

        self.progress.add_table(table, postgres_size, table_size)

        select = (
            "SELECT rowid, * FROM %s WHERE rowid >= ? ORDER BY rowid LIMIT ?"
            % (table,)
        )

        while True:
            def r(txn):
                txn.execute(select, (next_chunk, self.batch_size,))
                rows = txn.fetchall()
                headers = [column[0] for column in txn.description]

                return headers, rows

            headers, rows = yield self.sqlite_store.runInteraction("select", r)

            if rows:
                next_chunk = rows[-1][0] + 1

                if table == "event_search":
                    # We have to treat event_search differently since it has a
                    # different structure in the two different databases.
                    def insert(txn):
                        sql = (
                            "INSERT INTO event_search (event_id, room_id, key, sender, vector)"
                            " VALUES (?,?,?,?,to_tsvector('english', ?))"
                        )

                        rows_dict = [
                            dict(zip(headers, row))
                            for row in rows
                        ]

                        txn.executemany(sql, [
                            (
                                row["event_id"],
                                row["room_id"],
                                row["key"],
                                row["sender"],
                                row["value"],
                            )
                            for row in rows_dict
                        ])

                        self.postgres_store._simple_update_one_txn(
                            txn,
                            table="port_from_sqlite3",
                            keyvalues={"table_name": table},
                            updatevalues={"rowid": next_chunk},
                        )
                else:
                    self._convert_rows(table, headers, rows)

                    def insert(txn):
                        self.postgres_store.insert_many_txn(
                            txn, table, headers[1:], rows
                        )

                        self.postgres_store._simple_update_one_txn(
                            txn,
                            table="port_from_sqlite3",
                            keyvalues={"table_name": table},
                            updatevalues={"rowid": next_chunk},
                        )

                yield self.postgres_store.execute(insert)

                postgres_size += len(rows)

                self.progress.update(table, postgres_size)
            else:
                return

    def setup_db(self, db_config, database_engine):
        db_conn = database_engine.module.connect(
            **{
                k: v for k, v in db_config.get("args", {}).items()
                if not k.startswith("cp_")
            }
        )

        database_engine.prepare_database(db_conn)

        db_conn.commit()

    @defer.inlineCallbacks
    def run(self):
        try:
            sqlite_db_pool = adbapi.ConnectionPool(
                self.sqlite_config["name"],
                **self.sqlite_config["args"]
            )

            postgres_db_pool = adbapi.ConnectionPool(
                self.postgres_config["name"],
                **self.postgres_config["args"]
            )

            sqlite_engine = create_engine("sqlite3")
            postgres_engine = create_engine("psycopg2")

            self.sqlite_store = Store(sqlite_db_pool, sqlite_engine)
            self.postgres_store = Store(postgres_db_pool, postgres_engine)

            yield self.postgres_store.execute(
                postgres_engine.check_database
            )

            # Step 1. Set up databases.
            self.progress.set_state("Preparing SQLite3")
            self.setup_db(sqlite_config, sqlite_engine)

            self.progress.set_state("Preparing PostgreSQL")
            self.setup_db(postgres_config, postgres_engine)

            # Step 2. Get tables.
            self.progress.set_state("Fetching tables")
            sqlite_tables = yield self.sqlite_store._simple_select_onecol(
                table="sqlite_master",
                keyvalues={
                    "type": "table",
                },
                retcol="name",
            )

            postgres_tables = yield self.postgres_store._simple_select_onecol(
                table="information_schema.tables",
                keyvalues={
                    "table_schema": "public",
                },
                retcol="distinct table_name",
            )

            tables = set(sqlite_tables) & set(postgres_tables)

            self.progress.set_state("Creating tables")

            logger.info("Found %d tables", len(tables))

            def create_port_table(txn):
                txn.execute(
                    "CREATE TABLE port_from_sqlite3 ("
                    " table_name varchar(100) NOT NULL UNIQUE,"
                    " rowid bigint NOT NULL"
                    ")"
                )

            try:
                yield self.postgres_store.runInteraction(
                    "create_port_table", create_port_table
                )
            except Exception as e:
                logger.info("Failed to create port table: %s", e)

            self.progress.set_state("Setting up")

            # Set up tables.
            setup_res = yield defer.gatherResults(
                [
                    self.setup_table(table)
                    for table in tables
                    if table not in ["schema_version", "applied_schema_deltas"]
                    and not table.startswith("sqlite_")
                ],
                consumeErrors=True,
            )

            # Process tables.
            yield defer.gatherResults(
                [
                    self.handle_table(*res)
                    for res in setup_res
                ],
                consumeErrors=True,
            )

            self.progress.done()
        except:
            global end_error_exec_info
            end_error_exec_info = sys.exc_info()
            logger.exception("")
        finally:
            reactor.stop()

    def _convert_rows(self, table, headers, rows):
        bool_col_names = BOOLEAN_COLUMNS.get(table, [])

        bool_cols = [
            i for i, h in enumerate(headers) if h in bool_col_names
        ]

        def conv(j, col):
            if j in bool_cols:
                return bool(col)
            return col

        for i, row in enumerate(rows):
            rows[i] = tuple(
                conv(j, col)
                for j, col in enumerate(row)
                if j > 0
            )

    @defer.inlineCallbacks
    def _setup_sent_transactions(self):
        # Only save things from the last day
        yesterday = int(time.time()*1000) - 86400000

        # And save the max transaction id from each destination
        select = (
            "SELECT rowid, * FROM sent_transactions WHERE rowid IN ("
            "SELECT max(rowid) FROM sent_transactions"
            " GROUP BY destination"
            ")"
        )

        def r(txn):
            txn.execute(select)
            rows = txn.fetchall()
            headers = [column[0] for column in txn.description]

            ts_ind = headers.index('ts')

            return headers, [r for r in rows if r[ts_ind] < yesterday]

        headers, rows = yield self.sqlite_store.runInteraction(
            "select", r,
        )

        self._convert_rows("sent_transactions", headers, rows)

        inserted_rows = len(rows)
        if inserted_rows:
            max_inserted_rowid = max(r[0] for r in rows)

            def insert(txn):
                self.postgres_store.insert_many_txn(
                    txn, "sent_transactions", headers[1:], rows
                )

            yield self.postgres_store.execute(insert)
        else:
            max_inserted_rowid = 0

        def get_start_id(txn):
            txn.execute(
                "SELECT rowid FROM sent_transactions WHERE ts >= ?"
                " ORDER BY rowid ASC LIMIT 1",
                (yesterday,)
            )

            rows = txn.fetchall()
            if rows:
                return rows[0][0]
            else:
                return 1

        next_chunk = yield self.sqlite_store.execute(get_start_id)
        next_chunk = max(max_inserted_rowid + 1, next_chunk)

        yield self.postgres_store._simple_insert(
            table="port_from_sqlite3",
            values={"table_name": "sent_transactions", "rowid": next_chunk}
        )

        def get_sent_table_size(txn):
            txn.execute(
                "SELECT count(*) FROM sent_transactions"
                " WHERE ts >= ?",
                (yesterday,)
            )
            size, = txn.fetchone()
            return int(size)

        remaining_count = yield self.sqlite_store.execute(
            get_sent_table_size
        )

        total_count = remaining_count + inserted_rows

        defer.returnValue((next_chunk, inserted_rows, total_count))

    @defer.inlineCallbacks
    def _get_remaining_count_to_port(self, table, next_chunk):
        rows = yield self.sqlite_store.execute_sql(
            "SELECT count(*) FROM %s WHERE rowid >= ?" % (table,),
            next_chunk,
        )

        defer.returnValue(rows[0][0])

    @defer.inlineCallbacks
    def _get_already_ported_count(self, table):
        rows = yield self.postgres_store.execute_sql(
            "SELECT count(*) FROM %s" % (table,),
        )

        defer.returnValue(rows[0][0])

    @defer.inlineCallbacks
    def _get_total_count_to_port(self, table, next_chunk):
        remaining, done = yield defer.gatherResults(
            [
                self._get_remaining_count_to_port(table, next_chunk),
                self._get_already_ported_count(table),
            ],
            consumeErrors=True,
        )

        remaining = int(remaining) if remaining else 0
        done = int(done) if done else 0

        defer.returnValue((done, remaining + done))


##############################################
###### The following is simply UI stuff ######
##############################################


class Progress(object):
    """Used to report progress of the port
    """
    def __init__(self):
        self.tables = {}

        self.start_time = int(time.time())

    def add_table(self, table, cur, size):
        self.tables[table] = {
            "start": cur,
            "num_done": cur,
            "total": size,
            "perc": int(cur * 100 / size),
        }

    def update(self, table, num_done):
        data = self.tables[table]
        data["num_done"] = num_done
        data["perc"] = int(num_done * 100 / data["total"])

    def done(self):
        pass


class CursesProgress(Progress):
    """Reports progress to a curses window
    """
    def __init__(self, stdscr):
        self.stdscr = stdscr

        curses.use_default_colors()
        curses.curs_set(0)

        curses.init_pair(1, curses.COLOR_RED, -1)
        curses.init_pair(2, curses.COLOR_GREEN, -1)

        self.last_update = 0

        self.finished = False

        self.total_processed = 0
        self.total_remaining = 0

        super(CursesProgress, self).__init__()

    def update(self, table, num_done):
        super(CursesProgress, self).update(table, num_done)

        self.total_processed = 0
        self.total_remaining = 0
        for table, data in self.tables.items():
            self.total_processed += data["num_done"] - data["start"]
            self.total_remaining += data["total"] - data["num_done"]

        self.render()

    def render(self, force=False):
        now = time.time()

        if not force and now - self.last_update < 0.2:
            # reactor.callLater(1, self.render)
            return

        self.stdscr.clear()

        rows, cols = self.stdscr.getmaxyx()

        duration = int(now) - int(self.start_time)

        minutes, seconds = divmod(duration, 60)
        duration_str = '%02dm %02ds' % (minutes, seconds,)

        if self.finished:
            status = "Time spent: %s (Done!)" % (duration_str,)
        else:

            if self.total_processed > 0:
                left = float(self.total_remaining) / self.total_processed

                est_remaining = (int(now) - self.start_time) * left
                est_remaining_str = '%02dm %02ds remaining' % divmod(est_remaining, 60)
            else:
                est_remaining_str = "Unknown"
            status = (
                "Time spent: %s (est. remaining: %s)"
                % (duration_str, est_remaining_str,)
            )

        self.stdscr.addstr(
            0, 0,
            status,
            curses.A_BOLD,
        )

        max_len = max([len(t) for t in self.tables.keys()])

        left_margin = 5
        middle_space = 1

        items = self.tables.items()
        items.sort(
            key=lambda i: (i[1]["perc"], i[0]),
        )

        for i, (table, data) in enumerate(items):
            if i + 2 >= rows:
                break

            perc = data["perc"]

            color = curses.color_pair(2) if perc == 100 else curses.color_pair(1)

            self.stdscr.addstr(
                i+2, left_margin + max_len - len(table),
                table,
                curses.A_BOLD | color,
            )

            size = 20

            progress = "[%s%s]" % (
                "#" * int(perc*size/100),
                " " * (size - int(perc*size/100)),
            )

            self.stdscr.addstr(
                i+2, left_margin + max_len + middle_space,
                "%s %3d%% (%d/%d)" % (progress, perc, data["num_done"], data["total"]),
            )

        if self.finished:
            self.stdscr.addstr(
                rows-1, 0,
                "Press any key to exit...",
            )

        self.stdscr.refresh()
        self.last_update = time.time()

    def done(self):
        self.finished = True
        self.render(True)
        self.stdscr.getch()

    def set_state(self, state):
        self.stdscr.clear()
        self.stdscr.addstr(
            0, 0,
            state + "...",
            curses.A_BOLD,
        )
        self.stdscr.refresh()


class TerminalProgress(Progress):
    """Just prints progress to the terminal
    """
    def update(self, table, num_done):
        super(TerminalProgress, self).update(table, num_done)

        data = self.tables[table]

        print "%s: %d%% (%d/%d)" % (
            table, data["perc"],
            data["num_done"], data["total"],
        )

    def set_state(self, state):
        print state + "..."


##############################################
##############################################


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="A script to port an existing synapse SQLite database to"
                    " a new PostgreSQL database."
    )
    parser.add_argument("-v", action='store_true')
    parser.add_argument(
        "--sqlite-database", required=True,
        help="The snapshot of the SQLite database file. This must not be"
             " currently used by a running synapse server"
    )
    parser.add_argument(
        "--postgres-config", type=argparse.FileType('r'), required=True,
        help="The database config file for the PostgreSQL database"
    )
    parser.add_argument(
        "--curses", action='store_true',
        help="display a curses based progress UI"
    )

    parser.add_argument(
        "--batch-size", type=int, default=1000,
        help="The number of rows to select from the SQLite table each"
             " iteration [default=1000]",
    )

    args = parser.parse_args()

    logging_config = {
        "level": logging.DEBUG if args.v else logging.INFO,
        "format": "%(asctime)s - %(name)s - %(lineno)d - %(levelname)s - %(message)s"
    }

    if args.curses:
        logging_config["filename"] = "port-synapse.log"

    logging.basicConfig(**logging_config)

    sqlite_config = {
        "name": "sqlite3",
        "args": {
            "database": args.sqlite_database,
            "cp_min": 1,
            "cp_max": 1,
            "check_same_thread": False,
        },
    }

    postgres_config = yaml.safe_load(args.postgres_config)

    if "database" in postgres_config:
        postgres_config = postgres_config["database"]

    if "name" not in postgres_config:
        sys.stderr.write("Malformed database config: no 'name'")
        sys.exit(2)
    if postgres_config["name"] != "psycopg2":
        sys.stderr.write("Database must use 'psycopg2' connector.")
        sys.exit(3)

    def start(stdscr=None):
        if stdscr:
            progress = CursesProgress(stdscr)
        else:
            progress = TerminalProgress()

        porter = Porter(
            sqlite_config=sqlite_config,
            postgres_config=postgres_config,
            progress=progress,
            batch_size=args.batch_size,
        )

        reactor.callWhenRunning(porter.run)

        reactor.run()

    if args.curses:
        curses.wrapper(start)
    else:
        start()

    if end_error_exec_info:
        exc_type, exc_value, exc_traceback = end_error_exec_info
        traceback.print_exception(exc_type, exc_value, exc_traceback)