Paver commands for storing and retrieving bokchoy db caches from S3 (TE-2353)

Jesse Zoldak
2017-12-18 15:37:24 -05:00
parent e98420dd8d
commit 129bd7afdd
4 changed files with 385 additions and 88 deletions


@@ -1,15 +1,38 @@
"""
tasks for controlling the databases used in tests
Tasks for controlling the databases used in tests
"""
from __future__ import print_function
import os
import hashlib
from paver.easy import sh, needs
from paver.easy import needs
from pavelib.utils.db_utils import (
remove_files_from_folder, apply_migrations, compute_fingerprint_and_write_to_disk,
fingerprint_bokchoy_db_files, does_fingerprint_on_disk_match, is_fingerprint_in_bucket,
get_file_from_s3, extract_files_from_zip, create_tarfile_from_db_cache, upload_to_s3
)
from pavelib.utils.passthrough_opts import PassthroughTask
from pavelib.utils.timer import timed
from pavelib.utils.envs import Env
# Bokchoy db schema and data fixtures
BOKCHOY_DB_FILES = [
'bok_choy_data_default.json',
'bok_choy_data_student_module_history.json',
'bok_choy_migrations_data_default.sql',
'bok_choy_migrations_data_student_module_history.sql',
'bok_choy_schema_default.sql',
'bok_choy_schema_student_module_history.sql'
]
# Output files from scripts/calculate-bokchoy-migrations.sh
MIGRATION_OUTPUT_FILES = [
'bok_choy_default_migrations.yaml',
'bok_choy_student_module_history_migrations.yaml'
]
ALL_DB_FILES = BOKCHOY_DB_FILES + MIGRATION_OUTPUT_FILES
CACHE_BUCKET_NAME = 'edx-tools-database-caches'
CACHE_FOLDER = 'common/test/db_cache'
@needs('pavelib.prereqs.install_prereqs')
@@ -17,88 +40,96 @@ from pavelib.utils.envs import Env
@timed
def update_bokchoy_db_cache():
"""
Update and cache the MYSQL database for bokchoy testing. This command
will remove any previously cached database files and apply migrations
on a fresh db. Additionally, the collective sha1 checksum for all of
these files will be written to file, for future comparisons/checking
for updates.
Update and cache the MySQL database for bokchoy testing:
* Remove any previously cached database files
* Apply migrations on a fresh db
* Write the collective sha1 checksum for all of these files to disk
You can commit the resulting files in common/test/db_cache into
git to speed up test runs.
WARNING: This will take several minutes.
"""
bokchoy_db_files = [
'bok_choy_data_default.json',
'bok_choy_data_student_module_history.json',
'bok_choy_migrations_data_default.sql',
'bok_choy_migrations_data_student_module_history.sql',
'bok_choy_schema_default.sql',
'bok_choy_schema_student_module_history.sql'
]
print('Removing cached db files for bokchoy tests')
for db_file in bokchoy_db_files:
try:
db_file_path = os.path.join(
'{}/common/test/db_cache'.format(Env.REPO_ROOT), db_file
)
os.remove(db_file_path)
print('\tRemoved {}'.format(db_file_path))
except OSError:
continue
sh('{}/scripts/reset-test-db.sh'.format(Env.REPO_ROOT))
# Write the fingerprint of the database files to disk for use in future
# comparisons
fingerprint = fingerprint_bokchoy_db_files()
with open('common/test/db_cache/bokchoy_migrations.sha1', 'w') as fingerprint_file:
fingerprint_file.write(fingerprint)
remove_files_from_folder(BOKCHOY_DB_FILES, CACHE_FOLDER)
apply_migrations(BOKCHOY_DB_FILES, update_cache_files=True)
compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
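The checksum written in that last step lands in common/test/db_cache/bokchoy_migrations.sha1 (see FINGERPRINT_FILEPATH in pavelib/utils/db_utils.py below) and is committed alongside the cache files. A minimal sketch for inspecting it:

# Inspect the committed fingerprint; the path matches FINGERPRINT_FILEPATH below.
with open('common/test/db_cache/bokchoy_migrations.sha1') as fingerprint_file:
    print(fingerprint_file.read().strip())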
def compare_bokchoy_db_fingerprints():
@needs('pavelib.prereqs.install_prereqs')
@PassthroughTask
@timed
def update_local_bokchoy_db_from_s3():
"""
Determine if the current state of the bokchoy databases and related files
have changed since the last time they were updated in the repository by
comparing their fingerprint to the fingerprint saved in the repo
Update the MySQL database for bokchoy testing:
* Determine if your current cache files are up to date
with all the migrations
* If not, check whether there is a copy up at S3
* If so, download and extract it
* Otherwise, apply migrations as usual
"""
try:
fingerprint_filepath = '{}/common/test/db_cache/bokchoy_migrations.sha1'.format(Env.REPO_ROOT)
with open(fingerprint_filepath, 'r') as fingerprint_file:
cached_fingerprint = fingerprint_file.read().strip()
except IOError:
return False
current_fingerprint = fingerprint_bokchoy_db_files()
return current_fingerprint == cached_fingerprint
fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
if does_fingerprint_on_disk_match(fingerprint):
print ("DB cache files match the current migrations.")
# TODO: we don't really need to apply migrations, just
# load the db cache files into the database.
apply_migrations(BOKCHOY_DB_FILES, update_cache_files=False)
elif is_fingerprint_in_bucket(fingerprint, CACHE_BUCKET_NAME):
print ("Found updated bokchoy db files at S3.")
refresh_bokchoy_db_cache_from_s3(fingerprint=fingerprint)
# TODO: we don't really need to apply migrations, just
# load the db cache files into the database.
apply_migrations(BOKCHOY_DB_FILES, update_cache_files=False)
# Write the new fingerprint to disk so that it reflects the
# current state of the system.
compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
else:
msg = "{} {} {}".format(
"Did not find updated bokchoy db files at S3.",
"Loading the bokchoy db files from disk",
"and running migrations."
)
print(msg)
apply_migrations(BOKCHOY_DB_FILES, update_cache_files=True)
# Write the new fingerprint to disk so that it reflects the
# current state of the system.
# E.g. you could have added a new migration in your PR.
compute_fingerprint_and_write_to_disk(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
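For debugging, the three branches above can be probed without touching the database by calling the predicate helpers directly. A sketch using only names introduced in this commit; note that fingerprinting shells out to scripts/calculate-bokchoy-migrations.sh as a side effect:

from pavelib.utils.db_utils import (
    fingerprint_bokchoy_db_files, does_fingerprint_on_disk_match, is_fingerprint_in_bucket
)

# MIGRATION_OUTPUT_FILES, ALL_DB_FILES and CACHE_BUCKET_NAME are the
# module-level constants defined at the top of this file.
fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
if does_fingerprint_on_disk_match(fingerprint):
    print('branch 1: cache files on disk are current')
elif is_fingerprint_in_bucket(fingerprint, CACHE_BUCKET_NAME):
    print('branch 2: a matching archive exists in S3')
else:
    print('branch 3: migrations would be applied from scratch')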
def fingerprint_bokchoy_db_files():
@needs('pavelib.prereqs.install_prereqs')
@PassthroughTask
@timed
def refresh_bokchoy_db_cache_from_s3(fingerprint=None):
"""
Generate a sha1 checksum for files used to configure the bokchoy databases.
This checksum will represent the current 'state' of the databases,
including schema, migrations to be run and data. It can be used to determine
if the databases need to be updated.
If the cache files for the current fingerprint exist
in S3, then replace what you have on disk with those.
If no copy exists in S3, then continue without error.
"""
# Run the calculate-bokchoy-migrations script, which will generate two
# yml files. These tell whether or not we need to run migrations
sh('{}/scripts/calculate-bokchoy-migrations.sh'.format(Env.REPO_ROOT))
db_files = [
# Bokchoy db schema and data fixtures
'bok_choy_data_default.json',
'bok_choy_data_student_module_history.json',
'bok_choy_migrations_data_default.sql',
'bok_choy_migrations_data_student_module_history.sql',
'bok_choy_schema_default.sql',
'bok_choy_schema_student_module_history.sql',
# Output files from scripts/calculate-bokchoy-migrations.sh
'bok_choy_default_migrations.yaml',
'bok_choy_student_module_history_migrations.yaml'
]
hasher = hashlib.sha1()
file_paths = [
os.path.join('common/test/db_cache', db_file) for db_file in db_files
]
for file_path in file_paths:
with open(file_path, 'rb') as file_handle:
hasher.update(file_handle.read())
fingerprint = hasher.hexdigest()
print("Computed fingerprint for bokchoy db files: {}".format(fingerprint))
return fingerprint
if not fingerprint:
fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
bucket_name = CACHE_BUCKET_NAME
path = CACHE_FOLDER
if is_fingerprint_in_bucket(fingerprint, bucket_name):
zipfile_name = '{}.tar.gz'.format(fingerprint)
get_file_from_s3(bucket_name, zipfile_name, path)
zipfile_path = os.path.join(path, zipfile_name)
print ("Extracting db cache files.")
extract_files_from_zip(BOKCHOY_DB_FILES, zipfile_path, path)
os.remove(zipfile_path)
@needs('pavelib.prereqs.install_prereqs')
@PassthroughTask
@timed
def upload_db_cache_to_s3():
"""
Update the S3 bucket with the bokchoy DB cache files.
"""
fingerprint = fingerprint_bokchoy_db_files(MIGRATION_OUTPUT_FILES, ALL_DB_FILES)
zipfile_name, zipfile_path = create_tarfile_from_db_cache(
fingerprint, BOKCHOY_DB_FILES, CACHE_FOLDER
)
upload_to_s3(zipfile_name, zipfile_path, CACHE_BUCKET_NAME)
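Taken together, a CI job that rebuilds and publishes the shared cache would chain the two tasks above. A minimal sketch, assuming the tasks are dispatched through paver's call_task and that the module is importable as pavelib.database (the file path is not shown in this diff):

# Hypothetical cache-refresh job; the module path is an assumption.
from paver.easy import call_task

call_task('pavelib.database.update_bokchoy_db_cache')  # rebuild cache files and fingerprint
call_task('pavelib.database.upload_db_cache_to_s3')    # publish <fingerprint>.tar.gz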


@@ -0,0 +1,61 @@
"""
Tests for the Paver commands for updating test databases and their utility methods
"""
import shutil
import tarfile
from tempfile import mkdtemp
import os
from unittest import TestCase
import boto
from mock import patch
from common.test.utils import MockS3Mixin
from pavelib.utils.db_utils import is_fingerprint_in_bucket, extract_files_from_zip
class TestPaverDbS3Utils(MockS3Mixin, TestCase):
""" Tests for paver bokchoy database utils related to s3 """
def setUp(self):
super(TestPaverDbS3Utils, self).setUp()
conn = boto.connect_s3()
conn.create_bucket('moto_test_bucket')
self.bucket = conn.get_bucket('moto_test_bucket')
def test_fingerprint_in_bucket(self):
key = boto.s3.key.Key(bucket=self.bucket, name='testfile.tar.gz')
key.set_contents_from_string('this is a test')
self.assertTrue(is_fingerprint_in_bucket('testfile', 'moto_test_bucket'))
def test_fingerprint_not_in_bucket(self):
key = boto.s3.key.Key(bucket=self.bucket, name='testfile.tar.gz')
key.set_contents_from_string('this is a test')
self.assertFalse(is_fingerprint_in_bucket('otherfile', 'moto_test_bucket'))
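MockS3Mixin (imported from common.test.utils) presumably starts and stops an in-process S3 mock around each test. A standalone sketch of the same setup with moto; note that the decorator name for boto2 support has varied across moto releases (mock_s3 vs. mock_s3_deprecated), so treat this as illustrative:

import boto
from moto import mock_s3  # may be mock_s3_deprecated on newer moto releases

from pavelib.utils.db_utils import is_fingerprint_in_bucket

@mock_s3
def check_fingerprint_lookup():
    # Create a fake bucket holding a key shaped like '<fingerprint>.tar.gz'.
    conn = boto.connect_s3()
    bucket = conn.create_bucket('moto_test_bucket')
    key = boto.s3.key.Key(bucket=bucket, name='abc123.tar.gz')
    key.set_contents_from_string('fake archive')
    assert is_fingerprint_in_bucket('abc123', 'moto_test_bucket')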
class TestPaverDbUtils(TestCase):
""" Tests for paver bokchoy database utils """
@patch('pavelib.utils.db_utils.verify_files_exist')
def test_extract_files_from_zip(self, _mock_verify):
test_dir = mkdtemp()
output_dir = mkdtemp()
self.addCleanup(shutil.rmtree, test_dir)
self.addCleanup(shutil.rmtree, output_dir)
tmp_file_name = os.path.join(test_dir, 'test.txt')
with open(tmp_file_name, 'w') as tmp_file:
tmp_file.write('Test file content')
tmp_tarfile = os.path.join(test_dir, 'test.tar.gz')
with tarfile.open(name=tmp_tarfile, mode='w:gz') as tar_file:
tar_file.add(tmp_file_name, arcname='test.txt')
extract_files_from_zip(['test.txt'], tmp_tarfile, output_dir)
extracted_file = os.path.join(output_dir, 'test.txt')
assert os.path.isfile(extracted_file)
with open(extracted_file, 'r') as test_file:
data = test_file.read()
assert data == 'Test file content'

pavelib/utils/db_utils.py

@@ -0,0 +1,203 @@
"""
Utility methods for bokchoy database manipulation.
"""
from __future__ import print_function
import os
import tarfile
import boto
from paver.easy import BuildFailure, sh
from pavelib.prereqs import compute_fingerprint
from pavelib.utils.envs import Env
CACHE_FOLDER = 'common/test/db_cache'
FINGERPRINT_FILEPATH = '{}/{}/bokchoy_migrations.sha1'.format(Env.REPO_ROOT, CACHE_FOLDER)
def remove_files_from_folder(files, folder):
"""
Remove the specified files from the folder.
Treat any errors as nonfatal.
"""
for file_name in files:
file_with_path = os.path.join(folder, file_name)
try:
os.remove(file_with_path)
print('\tRemoved {}'.format(file_with_path))
except OSError:
print('\tCould not remove {}. Continuing.'.format(file_with_path))
continue
def apply_migrations(db_cache_files, update_cache_files=True):
"""
Apply migrations to the test database.
The called script will flush your db (or create it if it doesn't yet
exist), load in the db cache files if they exist on disk,
apply migrations, and then optionally write up-to-date cache files.
"""
print ("Applying migrations.")
cmd = '{}/scripts/reset-test-db.sh'.format(Env.REPO_ROOT)
if update_cache_files:
cmd = '{} --rebuild_cache'.format(cmd)
sh(cmd)
verify_files_exist(db_cache_files)
def compute_fingerprint_and_write_to_disk(migration_output_files, all_db_files):
"""
Write the fingerprint for the bok choy migrations state to disk.
"""
fingerprint = fingerprint_bokchoy_db_files(migration_output_files, all_db_files)
write_fingerprint_to_file(fingerprint)
return fingerprint
def fingerprint_bokchoy_db_files(migration_output_files, all_db_files):
"""
Generate a sha1 checksum for files used to configure the bokchoy
databases. This checksum will represent the current 'state' of
the databases, including schema and data, as well as the yaml files
that contain information about all the migrations.
It can be used to determine if migrations need to be run after
loading the schema and data.
"""
calculate_bokchoy_migrations(migration_output_files)
msg = "Verifying that all files needed to compute the fingerprint exist."
print(msg)
verify_files_exist(all_db_files)
file_paths = [
os.path.join(CACHE_FOLDER, db_file) for db_file in all_db_files
]
msg = "Computing the fingerprint."
print(msg)
fingerprint = compute_fingerprint(file_paths)
print("The fingerprint for bokchoy db files is: {}".format(fingerprint))
return fingerprint
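compute_fingerprint is imported from pavelib.prereqs; judging from the inline implementation this commit removes from the fingerprinting task, it amounts to a sha1 digest over the concatenated file contents. A minimal equivalent sketch:

import hashlib

def compute_fingerprint_sketch(file_paths):
    # sha1 over the bytes of every file, in list order; mirrors the
    # inline hashing loop removed above.
    hasher = hashlib.sha1()
    for file_path in file_paths:
        with open(file_path, 'rb') as file_handle:
            hasher.update(file_handle.read())
    return hasher.hexdigest()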
def write_fingerprint_to_file(fingerprint):
"""
Write the fingerprint of the database files to disk for use
in future comparisons. This file gets checked into the repo
along with the files.
"""
with open(FINGERPRINT_FILEPATH, 'w') as fingerprint_file:
fingerprint_file.write(fingerprint)
def verify_files_exist(files):
"""
Verify that the files were created.
This will help us notice/prevent breakages due to
changes to the bash script file.
"""
for file_name in files:
file_path = os.path.join(CACHE_FOLDER, file_name)
if not os.path.isfile(file_path):
msg = "Did not find expected file: {}".format(file_path)
raise BuildFailure(msg)
def calculate_bokchoy_migrations(migration_output_files):
"""
Run the calculate-bokchoy-migrations script, which will generate two
YAML files. These will tell us whether or not we need to run migrations.
NOTE: the script first clears out the database, then calculates
which migrations need to be run; starting from an empty database, that is all of them.
"""
sh('{}/scripts/calculate-bokchoy-migrations.sh'.format(Env.REPO_ROOT))
verify_files_exist(migration_output_files)
def does_fingerprint_on_disk_match(fingerprint):
"""
Determine if the fingerprint for the bokchoy database cache files
that was written to disk matches the one specified.
"""
cache_fingerprint = get_bokchoy_db_fingerprint_from_file()
return fingerprint == cache_fingerprint
def is_fingerprint_in_bucket(fingerprint, bucket_name):
"""
Check whether a tar.gz file matching the given fingerprint is present in an S3 bucket
"""
zipfile_name = '{}.tar.gz'.format(fingerprint)
conn = boto.connect_s3()
bucket = conn.get_bucket(bucket_name)
key = boto.s3.key.Key(bucket=bucket, name=zipfile_name)
return key.exists()
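Only key existence is checked here (boto's Key.exists), so the archive itself is never downloaded. For example, a hypothetical fingerprint can be probed cheaply:

fingerprint = '0123abcd' * 5  # placeholder 40-character sha1, not a real checksum
if is_fingerprint_in_bucket(fingerprint, 'edx-tools-database-caches'):
    print('cache archive {}.tar.gz is available'.format(fingerprint))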
def get_bokchoy_db_fingerprint_from_file():
"""
Return the value recorded in the fingerprint file.
"""
try:
with open(FINGERPRINT_FILEPATH, 'r') as fingerprint_file:
cached_fingerprint = fingerprint_file.read().strip()
except IOError:
return None
return cached_fingerprint
def get_file_from_s3(bucket_name, zipfile_name, path):
"""
Get the file from S3 and save it to disk.
"""
print ("Retrieving {} from bucket {}.".format(zipfile_name, bucket_name))
conn = boto.connect_s3()
bucket = conn.get_bucket(bucket_name)
key = boto.s3.key.Key(bucket=bucket, name=zipfile_name)
if not key.exists():
msg = "Did not find expected file {} in the S3 bucket {}".format(
zipfile_name, bucket_name
)
raise BuildFailure(msg)
zipfile_path = os.path.join(path, zipfile_name)
key.get_contents_to_filename(zipfile_path)
def extract_files_from_zip(files, zipfile_path, to_path):
"""
Extract the specified files from a tar.gz archive.
"""
with tarfile.open(name=zipfile_path, mode='r') as tar_file:
for file_name in files:
tar_file.extract(file_name, path=to_path)
verify_files_exist(files)
def create_tarfile_from_db_cache(fingerprint, files, path):
"""
Create a tar.gz file with the current bokchoy DB cache files.
"""
zipfile_name = '{}.tar.gz'.format(fingerprint)
zipfile_path = os.path.join(path, zipfile_name)
with tarfile.open(name=zipfile_path, mode='w:gz') as tar_file:
for name in files:
tar_file.add(os.path.join(path, name), arcname=name)
return zipfile_name, zipfile_path
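create_tarfile_from_db_cache and extract_files_from_zip are intended as inverses. A round-trip sketch over the cache folder, reusing the BOKCHOY_DB_FILES and CACHE_FOLDER constants from earlier in this commit (the fingerprint is a placeholder):

# Pack the current cache files, then unpack them again in place.
name, archive_path = create_tarfile_from_db_cache(
    'deadbeef', BOKCHOY_DB_FILES, CACHE_FOLDER
)
extract_files_from_zip(BOKCHOY_DB_FILES, archive_path, CACHE_FOLDER)
os.remove(archive_path)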
def upload_to_s3(file_name, file_path, bucket_name):
"""
Upload the specified file to an S3 bucket.
"""
print ("Uploading {} to s3 bucket {}".format(file_name, bucket_name))
conn = boto.connect_s3()
bucket = conn.get_bucket(bucket_name)
key = boto.s3.key.Key(bucket=bucket, name=file_name)
bytes_written = key.set_contents_from_filename(file_path, replace=False)
if bytes_written:
msg = "Wrote {} bytes to {}.".format(bytes_written, key.name)
else:
msg = "File {} already existed in bucket {}.".format(key.name, bucket_name)
print(msg)


@@ -18,22 +18,24 @@ else
SETTINGS="bok_choy_docker"
fi
declare -A databases
declare -a database_order
databases=(["default"]="edxtest" ["student_module_history"]="student_module_history_test")
database_order=("default" "student_module_history")
for db in "${database_order[@]}"; do
echo "CREATE DATABASE IF NOT EXISTS ${databases[$db]};" | mysql $MYSQL_HOST -u root
# Use a different database than the one used for testing,
# because we will need to empty out the database to calculate
# the migrations fingerprint.
# Choosing an arbitrary name "calculate_migrations" for the db.
echo "DROP DATABASE IF EXISTS calculate_migrations;" | mysql $MYSQL_HOST -u root
echo "CREATE DATABASE calculate_migrations;" | mysql $MYSQL_HOST -u root
# Clear out the test database using the reset_db command which uses "DROP DATABASE" and
# "CREATE DATABASE". This will result in an empty database.
echo "Clearing out the $db bok_choy MySQL database."
./manage.py lms --settings $SETTINGS reset_db --traceback --router $db
# Now output all the migrations in the platform to a file.
echo "Calculating migrations."
echo "Calculating migrations for fingerprinting."
output_file="common/test/db_cache/bok_choy_${db}_migrations.yaml"
./manage.py lms --settings $SETTINGS show_unapplied_migrations --database $db --output_file $output_file
# Redirect stdout to /dev/null because the script will print
# out all migrations to both stdout and the output file.
./manage.py lms --settings $SETTINGS show_unapplied_migrations --database $db --output_file $output_file 1>/dev/null
done
echo "DROP DATABASE IF EXISTS calculate_migrations;" | mysql $MYSQL_HOST -u root