Paver commands for storing and retrieving bokchoy db caches from S3 (TE-2353)

Jesse Zoldak
2017-12-18 15:37:24 -05:00
parent e98420dd8d
commit 129bd7afdd
4 changed files with 385 additions and 88 deletions


@@ -1,15 +1,38 @@
"""
tasks for controlling the databases used in tests
Tasks for controlling the databases used in tests
"""
from __future__ import print_function
import os
import hashlib
from paver.easy import sh, needs
from paver.easy import needs
from pavelib.utils.db_utils import (
remove_files_from_folder, apply_migrations, compute_fingerprint_and_write_to_disk,
fingerprint_bokchoy_db_files, does_fingerprint_on_disk_match, is_fingerprint_in_bucket,
get_file_from_s3, extract_files_from_zip, create_tarfile_from_db_cache, upload_to_s3
)
from pavelib.utils.passthrough_opts import PassthroughTask
from pavelib.utils.timer import timed
from pavelib.utils.envs import Env
# Bokchoy db schema and data fixtures
BOKCHOY_DB_FILES = [
'bok_choy_data_default.json',
'bok_choy_data_student_module_history.json',
'bok_choy_migrations_data_default.sql',
'bok_choy_migrations_data_student_module_history.sql',
'bok_choy_schema_default.sql',
'bok_choy_schema_student_module_history.sql'
]
# Output files from scripts/calculate-bokchoy-migrations.sh
MIGRATION_OUTPUT_FILES = [
'bok_choy_default_migrations.yaml',
'bok_choy_student_module_history_migrations.yaml'
]
ALL_DB_FILES = BOKCHOY_DB_FILES + MIGRATION_OUTPUT_FILES
CACHE_BUCKET_NAME = 'edx-tools-database-caches'
CACHE_FOLDER = 'common/test/db_cache'
@needs('pavelib.prereqs.install_prereqs')
@@ -17,88 +40,96 @@ from pavelib.utils.envs import Env
@timed
def update_bokchoy_db_cache():
"""
Update and cache the MYSQL database for bokchoy testing. This command
will remove any previously cached database files and apply migrations
on a fresh db. Additionally, the collective sha1 checksum for all of
these files will be written to file, for future comparisons/checking
for updates.
Update and cache the MySQL database for bokchoy testing:
* Remove any previously cached database files
* Apply migrations on a fresh db
* Write the collective sha1 checksum for all of these files to disk
You can commit the resulting files in common/test/db_cache into
git to speed up test runs.
WARNING: This will take several minutes.
"""
bokchoy_db_files = [
'bok_choy_data_default.json',
'bok_choy_data_student_module_history.json',
'bok_choy_migrations_data_default.sql',
'bok_choy_migrations_data_student_module_history.sql',
'bok_choy_schema_default.sql',
'bok_choy_schema_student_module_history.sql'
]
print('Removing cached db files for bokchoy tests')
for db_file in bokchoy_db_files:
try:
db_file_path = os.path.join(
'{}/common/test/db_cache'.format(Env.REPO_ROOT), db_file
)
os.remove(db_file_path)
print('\tRemoved {}'.format(db_file_path))
except OSError:
continue
sh('{}/scripts/reset-test-db.sh'.format(Env.REPO_ROOT))
# Write the fingerprint of the database files to disk for use in future
# comparisons
fingerprint = fingerprint_bokchoy_db_files()
with open('common/test/db_cache/bokchoy_migrations.sha1', 'w') as fingerprint_file:
fingerprint_file.write(fingerprint)
remove_files_from_folder(BOKCHOY_DB_FILES, CACHE_FOLDER)
apply_migrations(BOKCHOY_DB_FILES, update_cache_files=True)
compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
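The checksum written in that last step lands in common/test/db_cache/bokchoy_migrations.sha1 (see FINGERPRINT_FILEPATH in pavelib/utils/db_utils.py below) and is committed alongside the cache files. A minimal sketch for inspecting it:

# Inspect the committed fingerprint; the path matches FINGERPRINT_FILEPATH below.
with open('common/test/db_cache/bokchoy_migrations.sha1') as fingerprint_file:
    print(fingerprint_file.read().strip())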
def compare_bokchoy_db_fingerprints():
@needs('pavelib.prereqs.install_prereqs')
@PassthroughTask
@timed
def update_local_bokchoy_db_from_s3():
"""
Determine if the current state of the bokchoy databases and related files
have changed since the last time they were updated in the repository by
comparing their fingerprint to the fingerprint saved in the repo
Update the MySQL database for bokchoy testing:
* Determine if your current cache files are up to date
with all the migrations
* If not, check whether there is a copy up at S3
* If so, download and extract it
* Otherwise, apply migrations as usual
"""
try:
fingerprint_filepath = '{}/common/test/db_cache/bokchoy_migrations.sha1'.format(Env.REPO_ROOT)
with open(fingerprint_filepath, 'r') as fingerprint_file:
cached_fingerprint = fingerprint_file.read().strip()
except IOError:
return False
current_fingerprint = fingerprint_bokchoy_db_files()
return current_fingerprint == cached_fingerprint
fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
if does_fingerprint_on_disk_match(fingerprint):
print ("DB cache files match the current migrations.")
# TODO: we don't really need to apply migrations, just
# load the db cache files into the database.
apply_migrations(BOKCHOY_DB_FILES, update_cache_files=False)
elif is_fingerprint_in_bucket(fingerprint, CACHE_BUCKET_NAME):
print ("Found updated bokchoy db files at S3.")
refresh_bokchoy_db_cache_from_s3(fingerprint=fingerprint)
# TODO: we don't really need to apply migrations, just
# load the db cache files into the database.
apply_migrations(BOKCHOY_DB_FILES, update_cache_files=False)
# Write the new fingerprint to disk so that it reflects the
# current state of the system.
compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
else:
msg = "{} {} {}".format(
"Did not find updated bokchoy db files at S3.",
"Loading the bokchoy db files from disk",
"and running migrations."
)
print(msg)
apply_migrations(BOKCHOY_DB_FILES, update_cache_files=True)
# Write the new fingerprint to disk so that it reflects the
# current state of the system.
# E.g. you could have added a new migration in your PR.
compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
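For debugging, the three branches above can be probed without touching the database by calling the predicate helpers directly. A sketch using only names introduced in this commit; note that fingerprinting shells out to scripts/calculate-bokchoy-migrations.sh as a side effect:

from pavelib.utils.db_utils import (
    fingerprint_bokchoy_db_files, does_fingerprint_on_disk_match, is_fingerprint_in_bucket
)

# MIGRATION_OUTPUT_FILES, ALL_DB_FILES and CACHE_BUCKET_NAME are the
# module-level constants defined at the top of this file.
fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
if does_fingerprint_on_disk_match(fingerprint):
    print('branch 1: cache files on disk are current')
elif is_fingerprint_in_bucket(fingerprint, CACHE_BUCKET_NAME):
    print('branch 2: a matching archive exists in S3')
else:
    print('branch 3: migrations would be applied from scratch')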
def fingerprint_bokchoy_db_files():
@needs('pavelib.prereqs.install_prereqs')
@PassthroughTask
@timed
def refresh_bokchoy_db_cache_from_s3(fingerprint=None):
"""
Generate a sha1 checksum for files used to configure the bokchoy databases.
This checksum will represent the current 'state' of the databases,
including schema, migrations to be run and data. It can be used to determine
if the databases need to be updated.
If the cache files for the current fingerprint exist
in S3, then replace what you have on disk with those.
If no copy exists in S3, then continue without error.
"""
# Run the calculate-bokchoy-migrations script, which will generate two
# yml files. These tell whether or not we need to run migrations
sh('{}/scripts/calculate-bokchoy-migrations.sh'.format(Env.REPO_ROOT))
db_files = [
# Bokchoy db schema and data fixtures
'bok_choy_data_default.json',
'bok_choy_data_student_module_history.json',
'bok_choy_migrations_data_default.sql',
'bok_choy_migrations_data_student_module_history.sql',
'bok_choy_schema_default.sql',
'bok_choy_schema_student_module_history.sql',
# Output files from scripts/calculate-bokchoy-migrations.sh
'bok_choy_default_migrations.yaml',
'bok_choy_student_module_history_migrations.yaml'
]
hasher = hashlib.sha1()
file_paths = [
os.path.join('common/test/db_cache', db_file) for db_file in db_files
]
for file_path in file_paths:
with open(file_path, 'rb') as file_handle:
hasher.update(file_handle.read())
fingerprint = hasher.hexdigest()
print("Computed fingerprint for bokchoy db files: {}".format(fingerprint))
return fingerprint
if not fingerprint:
fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
bucket_name = CACHE_BUCKET_NAME
path = CACHE_FOLDER
if is_fingerprint_in_bucket(fingerprint, bucket_name):
zipfile_name = '{}.tar.gz'.format(fingerprint)
get_file_from_s3(bucket_name, zipfile_name, path)
zipfile_path = os.path.join(path, zipfile_name)
print ("Extracting db cache files.")
extract_files_from_zip(BOKCHOY_DB_FILES, zipfile_path, path)
os.remove(zipfile_path)
@needs('pavelib.prereqs.install_prereqs')
@PassthroughTask
@timed
def upload_db_cache_to_s3():
"""
Update the S3 bucket with the bokchoy DB cache files.
"""
fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
zipfile_name, zipfile_path = create_tarfile_from_db_cache(
fingerprint, BOKCHOY_DB_FILES, CACHE_FOLDER
)
upload_to_s3(zipfile_name, zipfile_path, CACHE_BUCKET_NAME)
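Taken together, a CI job that rebuilds and publishes the shared cache would chain the two tasks above. A minimal sketch, assuming the tasks are dispatched through paver's call_task and that the module is importable as pavelib.database (the file path is not shown in this diff):

# Hypothetical cache-refresh job; the module path is an assumption.
from paver.easy import call_task

call_task('pavelib.database.update_bokchoy_db_cache')  # rebuild cache files and fingerprint
call_task('pavelib.database.upload_db_cache_to_s3')    # publish <fingerprint>.tar.gz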


@@ -0,0 +1,61 @@
"""
Tests for the Paver commands for updating test databases and their utility methods
"""
import shutil
import tarfile
from tempfile import mkdtemp
import os
from unittest import TestCase
import boto
from mock import patch
from common.test.utils import MockS3Mixin
from pavelib.utils.db_utils import is_fingerprint_in_bucket, extract_files_from_zip
class TestPaverDbS3Utils(MockS3Mixin, TestCase):
""" Tests for paver bokchoy database utils related to s3 """
def setUp(self):
super(TestPaverDbS3Utils, self).setUp()
conn = boto.connect_s3()
conn.create_bucket('moto_test_bucket')
self.bucket = conn.get_bucket('moto_test_bucket')
def test_fingerprint_in_bucket(self):
key = boto.s3.key.Key(bucket=self.bucket, name='testfile.tar.gz')
key.set_contents_from_string('this is a test')
self.assertTrue(is_fingerprint_in_bucket('testfile', 'moto_test_bucket'))
def test_fingerprint_not_in_bucket(self):
key = boto.s3.key.Key(bucket=self.bucket, name='testfile.tar.gz')
key.set_contents_from_string('this is a test')
self.assertFalse(is_fingerprint_in_bucket('otherfile', 'moto_test_bucket'))
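MockS3Mixin (imported from common.test.utils) presumably starts and stops an in-process S3 mock around each test. A standalone sketch of the same setup with moto; note that the decorator name for boto2 support has varied across moto releases (mock_s3 vs. mock_s3_deprecated), so treat this as illustrative:

import boto
from moto import mock_s3  # may be mock_s3_deprecated on newer moto releases

from pavelib.utils.db_utils import is_fingerprint_in_bucket

@mock_s3
def check_fingerprint_lookup():
    # Create a fake bucket holding a key shaped like '<fingerprint>.tar.gz'.
    conn = boto.connect_s3()
    bucket = conn.create_bucket('moto_test_bucket')
    key = boto.s3.key.Key(bucket=bucket, name='abc123.tar.gz')
    key.set_contents_from_string('fake archive')
    assert is_fingerprint_in_bucket('abc123', 'moto_test_bucket')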
class TestPaverDbUtils(TestCase):
""" Tests for paver bokchoy database utils """
@patch('pavelib.utils.db_utils.verify_files_exist')
def test_extract_files_from_zip(self, _mock_verify):
test_dir = mkdtemp()
output_dir = mkdtemp()
self.addCleanup(shutil.rmtree, test_dir)
self.addCleanup(shutil.rmtree, output_dir)
tmp_file_name = os.path.join(test_dir, 'test.txt')
with open(tmp_file_name, 'w') as tmp_file:
tmp_file.write('Test file content')
tmp_tarfile = os.path.join(test_dir, 'test.tar.gz')
with tarfile.open(name=tmp_tarfile, mode='w:gz') as tar_file:
tar_file.add(tmp_file_name, arcname='test.txt')
extract_files_from_zip(['test.txt'], tmp_tarfile, output_dir)
extracted_file = os.path.join(output_dir, 'test.txt')
assert os.path.isfile(extracted_file)
with open(extracted_file, 'r') as test_file:
data = test_file.read()
assert data == 'Test file content'

pavelib/utils/db_utils.py

@@ -0,0 +1,203 @@
"""
Utility methods for bokchoy database manipulation.
"""
from __future__ import print_function
import os
import tarfile
import boto
from paver.easy import BuildFailure, sh
from pavelib.prereqs import compute_fingerprint
from pavelib.utils.envs import Env
CACHE_FOLDER = 'common/test/db_cache'
FINGERPRINT_FILEPATH = '{}/{}/bokchoy_migrations.sha1'.format(Env.REPO_ROOT, CACHE_FOLDER)
def remove_files_from_folder(files, folder):
"""
Remove the specified files from the folder.
Treat any errors as nonfatal.
"""
for file_name in files:
file_with_path = os.path.join(folder, file_name)
try:
os.remove(file_with_path)
print('\tRemoved {}'.format(file_with_path))
except OSError:
print('\tCould not remove {}. Continuing.'.format(file_with_path))
continue
def apply_migrations(db_cache_files, update_cache_files=True):
"""
Apply migrations to the test database.
The called script will flush your db (or create it if it doesn't yet
exist), load in the db cache files if they exist on disk,
apply migrations, and then optionally write up-to-date cache files.
"""
print ("Applying migrations.")
cmd = '{}/scripts/reset-test-db.sh'.format(Env.REPO_ROOT)
if update_cache_files:
cmd = '{} --rebuild_cache'.format(cmd)
sh(cmd)
verify_files_exist(db_cache_files)
def compute_fingerprint_and_write_to_disk(migration_output_files, all_db_files):
"""
Write the fingerprint for the bok choy migrations state to disk.
"""
fingerprint = fingerprint_bokchoy_db_files(migration_output_files, all_db_files)
write_fingerprint_to_file(fingerprint)
return fingerprint
def fingerprint_bokchoy_db_files(migration_output_files, all_db_files):
"""
Generate a sha1 checksum for files used to configure the bokchoy
databases. This checksum will represent the current 'state' of
the databases, including schema and data, as well as the yaml files
that contain information about all the migrations.
It can be used to determine if migrations need to be run after
loading the schema and data.
"""
calculate_bokchoy_migrations(migration_output_files)
msg = "Verifying that all files needed to compute the fingerprint exist."
print(msg)
verify_files_exist(all_db_files)
file_paths = [
os.path.join(CACHE_FOLDER, db_file) for db_file in all_db_files
]
msg = "Computing the fingerprint."
print(msg)
fingerprint = compute_fingerprint(file_paths)
print("The fingerprint for bokchoy db files is: {}".format(fingerprint))
return fingerprint
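compute_fingerprint is imported from pavelib.prereqs; judging from the inline implementation this commit removes from the fingerprinting task, it amounts to a sha1 digest over the concatenated file contents. A minimal equivalent sketch:

import hashlib

def compute_fingerprint_sketch(file_paths):
    # sha1 over the bytes of every file, in list order; mirrors the
    # inline hashing loop removed above.
    hasher = hashlib.sha1()
    for file_path in file_paths:
        with open(file_path, 'rb') as file_handle:
            hasher.update(file_handle.read())
    return hasher.hexdigest()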
def write_fingerprint_to_file(fingerprint):
"""
Write the fingerprint of the database files to disk for use
in future comparisons. This file gets checked into the repo
along with the files.
"""
with open(FINGERPRINT_FILEPATH, 'w') as fingerprint_file:
fingerprint_file.write(fingerprint)
def verify_files_exist(files):
"""
Verify that the files were created.
This will help us notice/prevent breakages due to
changes to the bash script file.
"""
for file_name in files:
file_path = os.path.join(CACHE_FOLDER, file_name)
if not os.path.isfile(file_path):
msg = "Did not find expected file: {}".format(file_path)
raise BuildFailure(msg)
def calculate_bokchoy_migrations(migration_output_files):
"""
Run the calculate-bokchoy-migrations script, which will generate two
YAML files. These will tell us whether or not we need to run migrations.
NOTE: the script first clears out the database, then calculates
which migrations need to be run; starting from an empty database, that is all of them.
"""
sh('{}/scripts/calculate-bokchoy-migrations.sh'.format(Env.REPO_ROOT))
verify_files_exist(migration_output_files)
def does_fingerprint_on_disk_match(fingerprint):
"""
Determine if the fingerprint for the bokchoy database cache files
that was written to disk matches the one specified.
"""
cache_fingerprint = get_bokchoy_db_fingerprint_from_file()
return fingerprint == cache_fingerprint
def is_fingerprint_in_bucket(fingerprint, bucket_name):
"""
Check whether a tar.gz file matching the given fingerprint is present in an S3 bucket
"""
zipfile_name = '{}.tar.gz'.format(fingerprint)
conn = boto.connect_s3()
bucket = conn.get_bucket(bucket_name)
key = boto.s3.key.Key(bucket=bucket, name=zipfile_name)
return key.exists()
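Only key existence is checked here (boto's Key.exists), so the archive itself is never downloaded. For example, a hypothetical fingerprint can be probed cheaply:

fingerprint = '0123abcd' * 5  # placeholder 40-character sha1, not a real checksum
if is_fingerprint_in_bucket(fingerprint, 'edx-tools-database-caches'):
    print('cache archive {}.tar.gz is available'.format(fingerprint))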
def get_bokchoy_db_fingerprint_from_file():
"""
Return the value recorded in the fingerprint file.
"""
try:
with open(FINGERPRINT_FILEPATH, 'r') as fingerprint_file:
cached_fingerprint = fingerprint_file.read().strip()
except IOError:
return None
return cached_fingerprint
def get_file_from_s3(bucket_name, zipfile_name, path):
"""
Get the file from S3 and save it to disk.
"""
print ("Retrieving {} from bucket {}.".format(zipfile_name, bucket_name))
conn = boto.connect_s3()
bucket = conn.get_bucket(bucket_name)
key = boto.s3.key.Key(bucket=bucket, name=zipfile_name)
if not key.exists():
msg = "Did not find expected file {} in the S3 bucket {}".format(
zipfile_name, bucket_name
)
raise BuildFailure(msg)
zipfile_path = os.path.join(path, zipfile_name)
key.get_contents_to_filename(zipfile_path)
def extract_files_from_zip(files, zipfile_path, to_path):
"""
Extract the specified files from a tar.gz archive.
"""
with tarfile.open(name=zipfile_path, mode='r') as tar_file:
for file_name in files:
tar_file.extract(file_name, path=to_path)
verify_files_exist(files)
def create_tarfile_from_db_cache(fingerprint, files, path):
"""
Create a tar.gz file with the current bokchoy DB cache files.
"""
zipfile_name = '{}.tar.gz'.format(fingerprint)
zipfile_path = os.path.join(path, zipfile_name)
with tarfile.open(name=zipfile_path, mode='w:gz') as tar_file:
for name in files:
tar_file.add(os.path.join(path, name), arcname=name)
return zipfile_name, zipfile_path
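create_tarfile_from_db_cache and extract_files_from_zip are intended as inverses. A round-trip sketch over the cache folder, reusing the BOKCHOY_DB_FILES and CACHE_FOLDER constants from earlier in this commit (the fingerprint is a placeholder):

# Pack the current cache files, then unpack them again in place.
name, archive_path = create_tarfile_from_db_cache(
    'deadbeef', BOKCHOY_DB_FILES, CACHE_FOLDER
)
extract_files_from_zip(BOKCHOY_DB_FILES, archive_path, CACHE_FOLDER)
os.remove(archive_path)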
def upload_to_s3(file_name, file_path, bucket_name):
"""
Upload the specified file to an S3 bucket.
"""
print ("Uploading {} to s3 bucket {}".format(file_name, bucket_name))
conn = boto.connect_s3()
bucket = conn.get_bucket(bucket_name)
key = boto.s3.key.Key(bucket=bucket, name=file_name)
bytes_written = key.set_contents_from_filename(file_path, replace=False)
if bytes_written:
msg = "Wrote {} bytes to {}.".format(bytes_written, key.name)
else:
msg = "File {} already existed in bucket {}.".format(key.name, bucket_name)
print(msg)


@@ -18,22 +18,24 @@ else
SETTINGS="bok_choy_docker"
fi
declare -A databases
declare -a database_order
databases=(["default"]="edxtest" ["student_module_history"]="student_module_history_test")
database_order=("default" "student_module_history")
for db in "${database_order[@]}"; do
echo "CREATE DATABASE IF NOT EXISTS ${databases[$db]};" | mysql $MYSQL_HOST -u root
# Use a different database than the one used for testing,
# because we will need to empty out the database to calculate
# the migrations fingerprint.
# Choosing an arbitrary name "calculate_migrations" for the db.
echo "DROP DATABASE IF EXISTS calculate_migrations;" | mysql $MYSQL_HOST -u root
echo "CREATE DATABASE calculate_migrations;" | mysql $MYSQL_HOST -u root
# Clear out the test database using the reset_db command which uses "DROP DATABASE" and
# "CREATE DATABASE". This will result in an empty database.
echo "Clearing out the $db bok_choy MySQL database."
./manage.py lms --settings $SETTINGS reset_db --traceback --router $db
# Now output all the migrations in the platform to a file.
echo "Calculating migrations."
echo "Calculating migrations for fingerprinting."
output_file="common/test/db_cache/bok_choy_${db}_migrations.yaml"
./manage.py lms --settings $SETTINGS show_unapplied_migrations --database $db --output_file $output_file
# Redirect stdout to /dev/null because the script will print
# out all migrations to both stdout and the output file.
./manage.py lms --settings $SETTINGS show_unapplied_migrations --database $db --output_file $output_file 1>/dev/null
done
echo "DROP DATABASE IF EXISTS calculate_migrations;" | mysql $MYSQL_HOST -u root