Copying Databases in Node

Yes, there are tons of great, Stack Overflow-accepted ways to copy giant databases between servers. But sometimes a project has a special need that requires selectively copying subsets of data instead of an entire 15 GB database.

So, when you have to programmatically copy rows between databases in Node…

Problems with LIMIT

When I first attempted this, I ran into several problems:

  • I didn’t use a LIMIT, and ran out of memory.
  • I specified a LIMIT, but the bulk INSERT failed for some reason.
  • I specified a lower LIMIT, but the INSERT performed differently on each table.
  • I specified an arbitrarily low LIMIT, but on simple tables, it took longer.

I realized there’s no perfect way of doing this, and I surely didn’t want to insert a single row at a time.
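
For context, the naive approach looks something like the sketch below (using the same remote/local knex instances that get set up in step 1; the table name is a placeholder). Pulling every row into memory at once is what blows up on large tables, and the single giant bulk INSERT is what fails unpredictably:

// Naive sketch: fine for small tables, falls over on big ones.
// `some_table` is a placeholder name.
remote('some_table').select('*').then(function(rows) {
  // `rows` is the ENTIRE table held in memory...
  return local('some_table').insert(rows); // ...and this is one enormous bulk INSERT
});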

Group INSERTs by Size, not Rows

First, let’s use the following tools:

  • knex (SQL query builder for Node)
  • bluebird (full-featured promise library)
  • progress (terminal progress bar)
  • chalk (terminal string coloring)

The premise is to copy N rows that roughly equate to M amount of memory.
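
For example, take a hypothetical table that information_schema reports as 500 MB of data plus indexes, containing 1,000,000 rows. With a 25 MB budget, that works out to roughly 1,000,000 × 25 / 500 ≈ 50,000 rows per INSERT group, or about 20 groups for the whole table. (The exact numbers come from the rowLimit calculation in step 2.)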

1. Set up dependencies

var knex        = require('knex');
var Promise     = require('bluebird');
var ProgressBar = require('progress');
var chalk       = require('chalk');

var local   = knex.initialize(...);
var remote  = knex.initialize(...);
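
The `...` placeholders stand in for your own connection settings. For reference, a hypothetical config passed to knex.initialize might look like this (the MySQL client and all credentials below are assumptions):

var remote = knex.initialize({
  client: 'mysql',              // assuming MySQL; any knex-supported client works
  connection: {
    host:     'db.example.com', // placeholder values
    user:     'copy_user',
    password: 'secret',
    database: 'production_db',
  },
});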

2. Get table stats

var getTableStats = function(table) {
  var stats = {};

  // Query the exact number of rows (the `information_schema` row count is only an estimate)
  return remote(table).count('*').then(function(response) {
    stats.totalRows = response.shift()['count(*)'];
  })
  // Request DB estimates for table size (approximate)
  .then(function() {
    return remote('information_schema.TABLES').select('data_length', 'index_length').where({
      table_name:   table,
      table_schema: remote.client.connectionSettings.database,
    });
  })
  // Calculate stats for # of rows & iterations to stay within memory limits
  .then(function(result) {
    // Base variables for insertion chunk calculations
    stats.memoryLimit = 25 * 1024 * 1024; // 25 MB
    stats.dataLength  = result[0]['data_length'];
    stats.indexLength = result[0]['index_length'];
    stats.totalLength = stats.dataLength + stats.indexLength;
    stats.rowLimit    = Math.min(stats.totalRows, Math.ceil(stats.totalRows * stats.memoryLimit / stats.totalLength));

    return stats;
  });
};
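
Called on its own, getTableStats resolves with a plain stats object. With the hypothetical 500 MB, 1,000,000-row table from earlier (`orders` is a placeholder name), it would look roughly like this:

getTableStats('orders').then(function(stats) {
  console.log(stats);
  // { totalRows:   1000000,
  //   memoryLimit: 26214400,   // 25 MB in bytes
  //   dataLength:  450000000,
  //   indexLength: 50000000,
  //   totalLength: 500000000,
  //   rowLimit:    52429 }     // ~25/500 of the rows per INSERT group
});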

3. Copy table in groups

Now that we have stats built around a 25 MB memory limit, we can copy the table:

var copyTable = function(table) {
  return getTableStats(table).then(function(stats) {
    // Empty array to make it easy to iterate for insertion group
    var iterations = new Array(Math.ceil(stats.totalRows / stats.rowLimit));

    // Promise.reduce ensures iterations are done sequentially, whereas
    // Promise.all/map runs in parallel
    return Promise.reduce(iterations, function(previous, iteration, i) {
      var rowOffset = stats.rowLimit * i;

      return copyRows(table, rowOffset, stats.rowLimit, stats.totalRows);
    }, []);
  });
};

4. Copy individual rows

All that our copyTable function is missing is copyRows:

var copyRows = function(table, offset, limit, total) {
  return remote(table).select('*').offset(offset).limit(limit).then(function(rows) {
    // "[DATABASE] Inserting records N-M of O into TABLE"
    var msg = [
      '[' + chalk.yellow(local.client.connectionSettings.database) + ']',
      'Inserting records',
      chalk.green(offset + 1) + '-' + chalk.green(offset + rows.length),
      'of',
      chalk.green(total),
      'into',
      chalk.yellow(table),
      '[:bar] :percent :etas',
    ].join(' ');

    var bar = new ProgressBar(msg, {
      total: rows.length,
      width: 20,
    });

    // Insert each record individually because it's actually faster (!?)
    // and gives us a pretty progress bar.
    return Promise.map(rows, function(row) {
      return local(table).insert(row).then(function() {
        bar.tick();
      });
    });
  });
};
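
One possible tweak: Promise.map fires every insert in parallel by default, which can saturate the local connection pool. Bluebird's map accepts an options object with a concurrency limit if that becomes a problem (the limit of 10 below is an arbitrary choice):

    return Promise.map(rows, function(row) {
      return local(table).insert(row).then(function() {
        bar.tick();
      });
    }, { concurrency: 10 }); // cap the number of in-flight INSERTs; 10 is arbitrary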

5. Finish

Finally, kick off the whole promise chain:

copyTable('something_huge').then(function() {
  console.log('Success!');
  process.exit(0);
}, function(err) {
  console.error(err);
  process.exit(1);
});
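
To copy several tables instead of one, the same sequential Promise.reduce trick works at the table level too (the table names below are placeholders):

var tables = ['users', 'orders', 'line_items']; // placeholder table names

Promise.reduce(tables, function(previous, table) {
  return copyTable(table);
}, null).then(function() {
  console.log('All tables copied!');
  process.exit(0);
}, function(err) {
  console.error(err);
  process.exit(1);
});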
