diff --git a/pavelib/database.py b/pavelib/database.py index 8bf663aa0e..6e379594d2 100644 --- a/pavelib/database.py +++ b/pavelib/database.py @@ -1,15 +1,38 @@ """ -tasks for controlling the databases used in tests +Tasks for controlling the databases used in tests """ from __future__ import print_function import os -import hashlib -from paver.easy import sh, needs +from paver.easy import needs +from pavelib.utils.db_utils import ( + remove_files_from_folder, apply_migrations, compute_fingerprint_and_write_to_disk, + fingerprint_bokchoy_db_files, does_fingerprint_on_disk_match, is_fingerprint_in_bucket, + get_file_from_s3, extract_files_from_zip, create_tarfile_from_db_cache, upload_to_s3 +) from pavelib.utils.passthrough_opts import PassthroughTask from pavelib.utils.timer import timed -from pavelib.utils.envs import Env + +# Bokchoy db schema and data fixtures +BOKCHOY_DB_FILES = [ + 'bok_choy_data_default.json', + 'bok_choy_data_student_module_history.json', + 'bok_choy_migrations_data_default.sql', + 'bok_choy_migrations_data_student_module_history.sql', + 'bok_choy_schema_default.sql', + 'bok_choy_schema_student_module_history.sql' +] + +# Output files from scripts/calculate-bokchoy-migrations.sh +MIGRATION_OUTPUT_FILES = [ + 'bok_choy_default_migrations.yaml', + 'bok_choy_student_module_history_migrations.yaml' +] + +ALL_DB_FILES = BOKCHOY_DB_FILES + MIGRATION_OUTPUT_FILES +CACHE_BUCKET_NAME = 'edx-tools-database-caches' +CACHE_FOLDER = 'common/test/db_cache' @needs('pavelib.prereqs.install_prereqs') @@ -17,88 +40,96 @@ from pavelib.utils.envs import Env @timed def update_bokchoy_db_cache(): """ - Update and cache the MYSQL database for bokchoy testing. This command - will remove any previously cached database files and apply migrations - on a fresh db. Additionally, the collective sha1 checksum for all of - these files will be written to file, for future comparisons/checking - for updates. 
+ Update and cache the MYSQL database for bokchoy testing: + * Remove any previously cached database files + * Apply migrations on a fresh db + * Write the collective sha1 checksum for all of these files to disk - You can commit the resulting files in common/test/db_cache into - git to speed up test runs + WARNING: This will take several minutes. """ - bokchoy_db_files = [ - 'bok_choy_data_default.json', - 'bok_choy_data_student_module_history.json', - 'bok_choy_migrations_data_default.sql', - 'bok_choy_migrations_data_student_module_history.sql', - 'bok_choy_schema_default.sql', - 'bok_choy_schema_student_module_history.sql' - ] print('Removing cached db files for bokchoy tests') - for db_file in bokchoy_db_files: - try: - db_file_path = os.path.join( - '{}/common/test/db_cache'.format(Env.REPO_ROOT), db_file - ) - os.remove(db_file_path) - print('\tRemoved {}'.format(db_file_path)) - except OSError: - continue - - sh('{}/scripts/reset-test-db.sh'.format(Env.REPO_ROOT)) - - # Write the fingerprint of the database files to disk for use in future - # comparisons - fingerprint = fingerprint_bokchoy_db_files() - with open('common/test/db_cache/bokchoy_migrations.sha1', 'w') as fingerprint_file: - fingerprint_file.write(fingerprint) + remove_files_from_folder(BOKCHOY_DB_FILES, CACHE_FOLDER) + apply_migrations(BOKCHOY_DB_FILES, update_cache_files=True) + compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES) -def compare_bokchoy_db_fingerprints(): +@needs('pavelib.prereqs.install_prereqs') +@PassthroughTask +@timed +def update_local_bokchoy_db_from_s3(): """ - Determine if the current state of the bokchoy databases and related files - have changed since the last time they were updated in the repository by - comparing their fingerprint to the fingerprint saved in the repo + Update the MYSQL database for bokchoy testing: + * Determine if your current cache files are up to date + with all the migrations + * If not then check if there is a copy up at s3 
+ * If so then download then extract it + * Otherwise apply migrations as usual """ - try: - fingerprint_filepath = '{}/common/test/db_cache/bokchoy_migrations.sha1'.format(Env.REPO_ROOT) - with open(fingerprint_filepath, 'r') as fingerprint_file: - cached_fingerprint = fingerprint_file.read().strip() - except IOError: - return False - current_fingerprint = fingerprint_bokchoy_db_files() - return current_fingerprint == cached_fingerprint + fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES) + + if does_fingerprint_on_disk_match(fingerprint): + print ("DB cache files match the current migrations.") + # TODO: we don't really need to apply migrations, just to + # load the db cache files into the database. + apply_migrations(BOKCHOY_DB_FILES, update_cache_files=False) + + elif is_fingerprint_in_bucket(fingerprint, CACHE_BUCKET_NAME): + print ("Found updated bokchoy db files at S3.") + refresh_bokchoy_db_cache_from_s3(fingerprint=fingerprint) + # TODO: we don't really need to apply migrations, just to + # load the db cache files into the database. + apply_migrations(BOKCHOY_DB_FILES, update_cache_files=False) + # Write the new fingerprint to disk so that it reflects the + # current state of the system. + compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES) + + else: + msg = "{} {} {}".format( + "Did not find updated bokchoy db files at S3.", + "Loading the bokchoy db files from disk", + "and running migrations." + ) + print (msg) + apply_migrations(BOKCHOY_DB_FILES, update_cache_files=True) + # Write the new fingerprint to disk so that it reflects the + # current state of the system. + # E.g. you could have added a new migration in your PR. 
+ compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES) -def fingerprint_bokchoy_db_files(): +@needs('pavelib.prereqs.install_prereqs') +@PassthroughTask +@timed +def refresh_bokchoy_db_cache_from_s3(fingerprint=None): """ - Generate a sha1 checksum for files used to configure the bokchoy databases. - This checksum will represent the current 'state' of the databases, - including schema, migrations to be run and data. It can be used to determine - if the databases need to be updated. + If the cache files for the current fingerprint exist + in s3 then replace what you have on disk with those. + If no copy exists on s3 then continue without error. """ - # Run the calculate-bokchoy-migrations script, which will generate two - # yml files. These tell whether or not we need to run migrations - sh('{}/scripts/calculate-bokchoy-migrations.sh'.format(Env.REPO_ROOT)) - db_files = [ - # Bokchoy db schema and data fixtures - 'bok_choy_data_default.json', - 'bok_choy_data_student_module_history.json', - 'bok_choy_migrations_data_default.sql', - 'bok_choy_migrations_data_student_module_history.sql', - 'bok_choy_schema_default.sql', - 'bok_choy_schema_student_module_history.sql', - # Output files from scripts/calculate-bokchoy-migrations.sh - 'bok_choy_default_migrations.yaml', - 'bok_choy_student_module_history_migrations.yaml' - ] - hasher = hashlib.sha1() - file_paths = [ - os.path.join('common/test/db_cache', db_file) for db_file in db_files - ] - for file_path in file_paths: - with open(file_path, 'rb') as file_handle: - hasher.update(file_handle.read()) - fingerprint = hasher.hexdigest() - print("Computed fingerprint for bokchoy db files: {}".format(fingerprint)) - return fingerprint + if not fingerprint: + fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES) + + bucket_name = CACHE_BUCKET_NAME + path = CACHE_FOLDER + if is_fingerprint_in_bucket(fingerprint, bucket_name): + zipfile_name = '{}.tar.gz'.format(fingerprint) + 
get_file_from_s3(bucket_name, zipfile_name, path) + + zipfile_path = os.path.join(path, zipfile_name) + print ("Extracting db cache files.") + extract_files_from_zip(BOKCHOY_DB_FILES, zipfile_path, path) + os.remove(zipfile_path) + + +@needs('pavelib.prereqs.install_prereqs') +@PassthroughTask +@timed +def upload_db_cache_to_s3(): + """ + Update the S3 bucket with the bokchoy DB cache files. + """ + fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES) + zipfile_name, zipfile_path = create_tarfile_from_db_cache( + fingerprint, BOKCHOY_DB_FILES, CACHE_FOLDER + ) + upload_to_s3(zipfile_name, zipfile_path, CACHE_BUCKET_NAME) diff --git a/pavelib/paver_tests/test_database.py b/pavelib/paver_tests/test_database.py new file mode 100644 index 0000000000..6fdc284990 --- /dev/null +++ b/pavelib/paver_tests/test_database.py @@ -0,0 +1,61 @@ +""" +Tests for the Paver commands for updating test databases and its utility methods +""" +import shutil +import tarfile +from tempfile import mkdtemp +import os +from unittest import TestCase + +import boto +from mock import patch + +from common.test.utils import MockS3Mixin +from pavelib.utils.db_utils import is_fingerprint_in_bucket, extract_files_from_zip + + +class TestPaverDbS3Utils(MockS3Mixin, TestCase): + """ Tests for paver bokchoy database utils related to s3 """ + def setUp(self): + super(TestPaverDbS3Utils, self).setUp() + conn = boto.connect_s3() + conn.create_bucket('moto_test_bucket') + self.bucket = conn.get_bucket('moto_test_bucket') + + def test_fingerprint_in_bucket(self): + key = boto.s3.key.Key(bucket=self.bucket, name='testfile.tar.gz') + key.set_contents_from_string('this is a test') + self.assertTrue(is_fingerprint_in_bucket('testfile', 'moto_test_bucket')) + + def test_fingerprint_not_in_bucket(self): + key = boto.s3.key.Key(bucket=self.bucket, name='testfile.tar.gz') + key.set_contents_from_string('this is a test') + self.assertFalse(is_fingerprint_in_bucket('otherfile', 
'moto_test_bucket')) + + +class TestPaverDbUtils(TestCase): + """ Tests for paver bokchoy database utils """ + @patch('pavelib.utils.db_utils.verify_files_exist') + def test_extract_files_from_zip(self, _mock_verify): + test_dir = mkdtemp() + output_dir = mkdtemp() + self.addCleanup(shutil.rmtree, test_dir) + self.addCleanup(shutil.rmtree, output_dir) + + tmp_file_name = os.path.join(test_dir, 'test.txt') + with open(tmp_file_name, 'w') as tmp_file: + tmp_file.write('Test file content') + + tmp_tarfile = os.path.join(test_dir, 'test.tar.gz') + + with tarfile.open(name=tmp_tarfile, mode='w:gz') as tar_file: + tar_file.add(tmp_file_name, arcname='test.txt') + + extract_files_from_zip(['test.txt'], tmp_tarfile, output_dir) + + extracted_file = os.path.join(output_dir, 'test.txt') + assert os.path.isfile(extracted_file) + + with open(extracted_file, 'r') as test_file: + data = test_file.read() + assert data == 'Test file content' diff --git a/pavelib/utils/db_utils.py b/pavelib/utils/db_utils.py new file mode 100644 index 0000000000..34166c1995 --- /dev/null +++ b/pavelib/utils/db_utils.py @@ -0,0 +1,203 @@ +""" +Utility methods for bokchoy database manipulation. +""" +from __future__ import print_function +import os +import tarfile + +import boto +from paver.easy import BuildFailure, sh + +from pavelib.prereqs import compute_fingerprint +from pavelib.utils.envs import Env + +CACHE_FOLDER = 'common/test/db_cache' +FINGERPRINT_FILEPATH = '{}/{}/bokchoy_migrations.sha1'.format(Env.REPO_ROOT, CACHE_FOLDER) + + +def remove_files_from_folder(files, folder): + """ + Remove the specified files from the folder. + Catch any errors as nonfatal. + """ + for file_name in files: + file_with_path = os.path.join(folder, file_name) + try: + os.remove(file_with_path) + print('\tRemoved {}'.format(file_with_path)) + except OSError: + print('\tCould not remove {}. 
Continuing.'.format(file_with_path)) + continue + + +def apply_migrations(db_cache_files, update_cache_files=True): + """ + Apply migrations to the test database. + + The called script will flush your db (or create it if it doesn't yet + exist), load in the db cache files if they exist on disk, + apply migrations, and then optionally write up-to-date cache files. + """ + print ("Applying migrations.") + cmd = '{}/scripts/reset-test-db.sh'.format(Env.REPO_ROOT) + if update_cache_files: + cmd = '{} --rebuild_cache'.format(cmd) + sh(cmd) + verify_files_exist(db_cache_files) + + +def compute_fingerprint_and_write_to_disk(migration_output_files, all_db_files): + """ + Write the fingerprint for the bok choy migrations state to disk. + """ + fingerprint = fingerprint_bokchoy_db_files(migration_output_files, all_db_files) + write_fingerprint_to_file(fingerprint) + return fingerprint + + +def fingerprint_bokchoy_db_files(migration_output_files, all_db_files): + """ + Generate a sha1 checksum for files used to configure the bokchoy + databases. This checksum will represent the current 'state' of + the databases, including schema and data, as well as the yaml files + that contain information about all the migrations. + + It can be used to determine if migrations need to be run after + loading the schema and data. + """ + calculate_bokchoy_migrations(migration_output_files) + msg = "Verifying that all files needed to compute the fingerprint exist." + print(msg) + verify_files_exist(all_db_files) + + file_paths = [ + os.path.join(CACHE_FOLDER, db_file) for db_file in all_db_files + ] + msg = "Computing the fingerprint." + print(msg) + fingerprint = compute_fingerprint(file_paths) + print("The fingerprint for bokchoy db files is: {}".format(fingerprint)) + return fingerprint + + +def write_fingerprint_to_file(fingerprint): + """ + Write the fingerprint of the database files to disk for use + in future comparisons. 
This file gets checked into the repo + along with the files. + """ + with open(FINGERPRINT_FILEPATH, 'w') as fingerprint_file: + fingerprint_file.write(fingerprint) + + +def verify_files_exist(files): + """ + Verify that the files were created. + This will help us notice/prevent breakages due to + changes to the bash script file. + """ + for file_name in files: + file_path = os.path.join(CACHE_FOLDER, file_name) + if not os.path.isfile(file_path): + msg = "Did not find expected file: {}".format(file_path) + raise BuildFailure(msg) + + +def calculate_bokchoy_migrations(migration_output_files): + """ + Run the calculate-bokchoy-migrations script, which will generate two + yml files. These will tell us whether or not we need to run migrations. + + NOTE: the script first clears out the database, then calculates + what migrations need to be run, which is all of them. + """ + sh('{}/scripts/calculate-bokchoy-migrations.sh'.format(Env.REPO_ROOT)) + verify_files_exist(migration_output_files) + + +def does_fingerprint_on_disk_match(fingerprint): + """ + Determine if the fingerprint for the bokchoy database cache files + that was written to disk matches the one specified. + """ + cache_fingerprint = get_bokchoy_db_fingerprint_from_file() + return fingerprint == cache_fingerprint + + +def is_fingerprint_in_bucket(fingerprint, bucket_name): + """ + Test if a zip file matching the given fingerprint is present within an s3 bucket + """ + zipfile_name = '{}.tar.gz'.format(fingerprint) + conn = boto.connect_s3() + bucket = conn.get_bucket(bucket_name) + key = boto.s3.key.Key(bucket=bucket, name=zipfile_name) + return key.exists() + + +def get_bokchoy_db_fingerprint_from_file(): + """ + Return the value recorded in the fingerprint file. 
+ """ + try: + with open(FINGERPRINT_FILEPATH, 'r') as fingerprint_file: + cached_fingerprint = fingerprint_file.read().strip() + except IOError: + return None + return cached_fingerprint + + +def get_file_from_s3(bucket_name, zipfile_name, path): + """ + Get the file from s3 and save it to disk. + """ + print ("Retrieving {} from bucket {}.".format(zipfile_name, bucket_name)) + conn = boto.connect_s3() + bucket = conn.get_bucket(bucket_name) + key = boto.s3.key.Key(bucket=bucket, name=zipfile_name) + if not key.exists(): + msg = "Did not find expected file {} in the S3 bucket {}".format( + zipfile_name, bucket_name + ) + raise BuildFailure(msg) + + zipfile_path = os.path.join(path, zipfile_name) + key.get_contents_to_filename(zipfile_path) + + +def extract_files_from_zip(files, zipfile_path, to_path): + """ + Extract files from a zip. + """ + with tarfile.open(name=zipfile_path, mode='r') as tar_file: + for file_name in files: + tar_file.extract(file_name, path=to_path) + verify_files_exist(files) + + +def create_tarfile_from_db_cache(fingerprint, files, path): + """ + Create a tar.gz file with the current bokchoy DB cache files. + """ + zipfile_name = '{}.tar.gz'.format(fingerprint) + zipfile_path = os.path.join(path, zipfile_name) + with tarfile.open(name=zipfile_path, mode='w:gz') as tar_file: + for name in files: + tar_file.add(os.path.join(path, name), arcname=name) + return zipfile_name, zipfile_path + + +def upload_to_s3(file_name, file_path, bucket_name): + """ + Upload the specified files to an s3 bucket. 
+ """ + print ("Uploading {} to s3 bucket {}".format(file_name, bucket_name)) + conn = boto.connect_s3() + bucket = conn.get_bucket(bucket_name) + key = boto.s3.key.Key(bucket=bucket, name=file_name) + bytes_written = key.set_contents_from_filename(file_path, replace=False) + if bytes_written: + msg = "Wrote {} bytes to {}.".format(bytes_written, key.name) + else: + msg = "File {} already existed in bucket {}.".format(key.name, bucket_name) + print (msg) diff --git a/scripts/calculate-bokchoy-migrations.sh b/scripts/calculate-bokchoy-migrations.sh index f58f0aa3ad..c0fb58fae3 100755 --- a/scripts/calculate-bokchoy-migrations.sh +++ b/scripts/calculate-bokchoy-migrations.sh @@ -18,22 +18,24 @@ else SETTINGS="bok_choy_docker" fi -declare -A databases declare -a database_order -databases=(["default"]="edxtest" ["student_module_history"]="student_module_history_test") database_order=("default" "student_module_history") for db in "${database_order[@]}"; do - echo "CREATE DATABASE IF NOT EXISTS ${databases[$db]};" | mysql $MYSQL_HOST -u root + # Use a different database than the one used for testing, + # because we will need to empty out the database to calculate + # the migrations fingerprint. + # Choosing an arbitrary name "calculate_migrations" for the db. + echo "DROP DATABASE IF EXISTS calculate_migrations;" | mysql $MYSQL_HOST -u root + echo "CREATE DATABASE calculate_migrations;" | mysql $MYSQL_HOST -u root - # Clear out the test database using the reset_db command which uses "DROP DATABASE" and - # "CREATE DATABASE". This will result in an empty database. - echo "Clearing out the $db bok_choy MySQL database." - ./manage.py lms --settings $SETTINGS reset_db --traceback --router $db # Now output all the migrations in the platform to a file. - echo "Calculating migrations." + echo "Calculating migrations for fingerprinting." 
output_file="common/test/db_cache/bok_choy_${db}_migrations.yaml" - ./manage.py lms --settings $SETTINGS show_unapplied_migrations --database $db --output_file $output_file - + # Redirect stdout to /dev/null because the script will print + # out all migrations to both stdout and the output file. + ./manage.py lms --settings $SETTINGS show_unapplied_migrations --database $db --output_file $output_file 1>/dev/null done + +echo "DROP DATABASE IF EXISTS calculate_migrations;" | mysql $MYSQL_HOST -u root