Copying Databases in Node

Yes, there are tons of great, Stack Overflow ways to copy giant databases between servers. But, sometimes you have a special need for a project that requires selectively copying subsets of data instead of an entire 15 GB database.

So, when you have to programmatically copy rows between databases in Node…


Problems with LIMIT

When I first attempted this, I ran into several problems:

  • I didn’t use a LIMIT, and ran out of memory.
  • I specified a LIMIT, but the bulk INSERT failed for some reason.
  • Likewise, I specified a lower LIMIT, but the INSERT performed differently on each table.
  • I specified an arbitrarily low LIMIT, but on simple tables, it took longer.

I realized there’s no perfect way of doing this, and I surely didn’t want to insert a single row at a time.

Group INSERTs by Size, not Rows

First, let’s use the following tools:

The premise is to copy N rows that roughly equate to M amount of memory.

1. Setup dependencies

var knex = require('knex');
var Promise = require('bluebird');
var ProgressBar = require('progress');

var local = knex.initialize(...);
var remote = knex.initialize(...);

2. Get table stats

var getStats = function(table) {
  // Query total number of rows (`information_schema` is just an estimate)
  return remote(table).count('_').then(function(response) {
    stats.totalRows = response.shift()['count(_)'];
  })
  // Request DB estimates for table size (approximate)
  .then(function() {
    return remote('information*schema.TABLES').select('data_length', 'index_length').where({
      table_name: table,
      table_schema: this.remote.client.connectionSettings.database,
    });
  })
  // Calculate stats for # of rows & iterations to stay within memory limits
  .then(function(result) {
    // Base variables for insertion chunk calculations
    stats.memoryLimit = 25 * 1024 * 1024; // 25Mb
    stats.dataLength = result[0]['data_length'];
    stats.indexLength = result[0]['index_length'];
    stats.totalLength = stats.dataLength + stats.indexLength;
    stats.rowLimit = Math.min(stats.totalRows, Math.ceil(stats.totalRows * stats.memoryLimit / stats.totalLength));

    return stats;
  }})
};

3. Copy table in groups

Now that we have a lot of stats based around a 25Mb memory limit, now we can copy the table:

var copyTable = function (table) {
  return getTableStats(table).then(function (stats) {
    // Empty array to make it easy to iterate for insertion group
    var iterations = new Array(Math.ceil(stats.totalRows / stats.rowLimit))

    // Promise.reduce ensures iterations are done sequentially, whereas
    // Promise.all/map runs in parallel
    return Promise.reduce(
      iterations,
      function (previous, iteration, i) {
        var rowOffset = stats.rowLimit * i

        return copyRows(table, rowOffset, stats.rowLimit, stats.totalRows)
      },
      []
    )
  })
}

4. Copy individual rows

All that our copyTable function is missing is copyRows:

var copyRows = function (table, offset, limit, total) {
  return remote(table)
    .select('*')
    .offset(offset)
    .limit(limit)
    .then(function (rows) {
      // "[DATABASE] Inserting records N-M of O into TABLE"
      var msg = [
        '[' + chalk.yellow(local.client.connectionSettings.database) + ']',
        'Inserting records',
        chalk.green(offset + 1) + '-' + chalk.green(offset + rows.length),
        'of',
        chalk.green(total),
        'into',
        chalk.yellow(table),
        '[:bar] :percent :etas',
      ].join(' ')

      var bar = new Progress(msg, {
        total: rows.length,
        width: 20,
      })

      // Insert each record individually because it's actually faster (!?)
      // and gives us a pretty progress bar.
      return Promise.map(rows, function (row) {
        return local(table)
          .insert(row)
          .then(function () {
            bar.tick()
          })
      })
    })
}

5. Finish

Finally, kick off the whole promise chain:

copyTable('something_huge').then(
  function () {
    console.log('Success!')
    process.exit(0)
  },
  function (err) {
    console.error(err)
    process.exit(1)
  }
)

demo