#!/usr/bin/env python
"""
A linting tool to check if templates are safe
"""
from enum import Enum
import bisect
import os
import re
import sys


_skip_dirs = (
    '/node_modules',
    '/vendor',
    '/spec',
    '/.pycharm_helpers',
    '/test_root',
    '/reports/diff_quality',
    '/common/static/xmodule/modules',
)
_skip_mako_dirs = _skip_dirs
_skip_underscore_dirs = _skip_dirs + ('/test',)


def _is_skip_dir(skip_dirs, directory):
    """
    Determines whether a directory should be skipped or linted.

    Arguments:
        skip_dirs: The configured directories to be skipped.
        directory: The current directory to be tested.

    Returns:
        True if the directory should be skipped, and False otherwise.

    """
    # A skip dir matches either as an inner path segment ("<skip>/...") or as
    # the trailing segment of the directory.
    return any(
        (skip_dir + '/') in directory or directory.endswith(skip_dir)
        for skip_dir in skip_dirs
    )


def _load_file(self, file_full_path):
    """
    Loads a file into a string.

    Arguments:
        self: Unused; kept so existing call sites keep working.
        file_full_path: The full path of the file to be loaded.

    Returns:
        A (unicode) string containing the files contents, decoded as UTF-8.

    """
    # Read bytes and decode explicitly so the result is text on both
    # Python 2 and Python 3.
    with open(file_full_path, 'rb') as input_file:
        return input_file.read().decode('utf-8')


def _get_line_breaks(self, string):
    """
    Creates a list, where each entry represents the index into the string
    where the next line break was found.

    Arguments:
        self: Unused; kept so existing call sites keep working.
        string: The string in which to find line breaks.

    Returns:
        A list of indices into the string at which each line starts: 0,
        followed by the index just past every '\\n'.

    """
    line_breaks = [0]
    index = 0
    while True:
        index = string.find('\n', index)
        if index < 0:
            break
        # Record the first index of the following line.
        index += 1
        line_breaks.append(index)
    return line_breaks


def _get_line_number(self, line_breaks, index):
    """
    Given the list of line break indices, and an index, determines the line of
    the index.

    Arguments:
        self: Unused; kept so existing call sites keep working.
        line_breaks: A list of indices into a string at which each line break
            was found.
        index: The index into the original string for which we want to know
            the line number

    Returns:
        The (1-based) line number of the provided index.

    """
    # line_breaks is ascending, so the number of entries <= index is the
    # line number.
    return bisect.bisect_right(line_breaks, index)


def _get_line(self, string, line_breaks, line_number):
    """
    Gets the line of text designated by the provided line number.

    Arguments:
        self: Unused; kept so existing call sites keep working.
        string: The string of content with line breaks.
        line_breaks: A list of indices into a string at which each line break
            was found.
        line_number: The line number of the line we want to find.

    Returns:
        The line of text designated by the provided line number, without its
        trailing newline, as a native ``str`` (UTF-8 encoded bytes on
        Python 2, text on Python 3) so it can be printed safely.

    """
    start_index = line_breaks[line_number - 1]
    if len(line_breaks) == line_number:
        # Last line: runs to the end of the string.
        line = string[start_index:]
    else:
        # The next entry points just past this line's '\n'; drop the '\n'.
        line = string[start_index:line_breaks[line_number] - 1]
    if isinstance(line, str):
        return line
    return line.encode('utf-8')


def _get_column_number(self, line_breaks, line_number, index):
    """
    Gets the column (i.e. index into the line) for the given index into the
    original string.

    Arguments:
        self: Unused; kept so existing call sites keep working.
        line_breaks: A list of indices into a string at which each line break
            was found.
        line_number: The line number of the line we want to find.
        index: The index into the original string.

    Returns:
        The (1-based) column for the given index into the original string.

    """
    return index - line_breaks[line_number - 1] + 1


class Rules(Enum):
    """
    An Enum of each rule which the linter will check.

    Each value is a (rule_id, rule_summary) tuple.
    """
    mako_missing_default = (
        'mako-missing-default',
        'The default page directive with h filter is missing.'
    )
    mako_unparsable_expression = (
        'mako-unparsable-expression',
        'The expression could not be properly parsed.'
    )
    mako_unwanted_html_filter = (
        'mako-unwanted-html-filter',
        'Remove explicit h filters when it is provided by the page directive.'
    )
    mako_invalid_html_filter = (
        'mako-invalid-html-filter',
        'The expression is using an invalid filter in an HTML context.'
    )
    mako_invalid_js_filter = (
        'mako-invalid-js-filter',
        'The expression is using an invalid filter in a JavaScript context.'
    )
    mako_js_string_missing_quotes = (
        'mako-js-string-missing-quotes',
        'An expression using the js_escaped_string filter must have surrounding quotes.'
    )
    underscore_not_escaped = (
        'underscore-not-escaped',
        'Expressions should be escaped using <%- expression %>.'
    )

    def __init__(self, rule_id, rule_summary):
        self.rule_id = rule_id
        self.rule_summary = rule_summary


class RuleViolation(object):
    """
    Base class representing a rule violation which can be used for reporting.
    """

    def __init__(self, rule):
        """
        Init method.

        Arguments:
            rule: The Rule which was violated.

        """
        self.rule = rule
        self.full_path = ''

    def prepare_results(self, full_path, file_string, line_breaks):
        """
        Preps this instance for results reporting.

        Arguments:
            full_path: Path of the file in violation.
            file_string: The contents of the file in violation.
            line_breaks: A list of indices into file_string at which each line
                break was found.

        """
        self.full_path = full_path

    def print_results(self):
        """
        Prints the results represented by this rule violation.
        """
        print("{}: {}".format(self.full_path, self.rule.rule_id))


class ExpressionRuleViolation(RuleViolation):
    """
    A class representing a particular rule violation for expressions which
    contain more specific details of the location of the violation for
    reporting purposes.
    """

    def __init__(self, rule, expression):
        """
        Init method.

        Arguments:
            rule: The Rule which was violated.
            expression: The expression that was in violation (a dict with
                'start_index', 'end_index' and 'expression' keys).

        """
        super(ExpressionRuleViolation, self).__init__(rule)
        self.expression = expression
        self.start_line = 0
        self.start_column = 0
        self.end_line = 0
        self.end_column = 0
        self.lines = []

    def prepare_results(self, full_path, file_string, line_breaks):
        """
        Preps this instance for results reporting.

        Arguments:
            full_path: Path of the file in violation.
            file_string: The contents of the file in violation.
            line_breaks: A list of indices into file_string at which each line
                break was found.

        """
        self.full_path = full_path
        start_index = self.expression['start_index']
        self.start_line = _get_line_number(self, line_breaks, start_index)
        self.start_column = _get_column_number(self, line_breaks, self.start_line, start_index)
        end_index = self.expression['end_index']
        if end_index > 0:
            self.end_line = _get_line_number(self, line_breaks, end_index)
            self.end_column = _get_column_number(self, line_breaks, self.end_line, end_index)
        else:
            # No usable end index (e.g. unparsable expression): report only
            # the starting line.
            self.end_line = self.start_line
            self.end_column = '?'
        # Rebuild rather than append so repeated calls do not duplicate lines.
        self.lines = [
            _get_line(self, file_string, line_breaks, line_number)
            for line_number in range(self.start_line, self.end_line + 1)
        ]

    def print_results(self):
        """
        Prints the results represented by this rule violation, one output
        line per source line covered by the expression.
        """
        blank_rule_id = " " * (len(self.rule.rule_id) + 1)
        for line_number in range(self.start_line, self.end_line + 1):
            if line_number == self.start_line:
                column = self.start_column
                rule_id = self.rule.rule_id + ":"
            else:
                # Continuation lines align under the first one.
                column = 1
                rule_id = blank_rule_id
            print("{}: {}:{}: {} {}".format(
                self.full_path,
                line_number,
                column,
                rule_id,
                # self.lines[0] holds the text of self.start_line.
                self.lines[line_number - self.start_line]
            ))


class FileResults(object):
    """
    Contains the results, or violations, for a file.
    """

    def __init__(self, full_path):
        """
        Init method.

        Arguments:
            full_path: The full path for this file.

        """
        self.full_path = full_path
        self.violations = []

    def prepare_results(self, file_string):
        """
        Prepares the results for output for this file.

        Arguments:
            file_string: The string of content for this file.

        """
        line_breaks = _get_line_breaks(self, file_string)
        for violation in self.violations:
            violation.prepare_results(self.full_path, file_string, line_breaks)

    def print_results(self, options):
        """
        Prints the results (i.e. violations) in this file.

        Arguments:
            options: A list of the following options:
                is_quiet: True to print only file names, and False to print
                    all violations.

        """
        if options['is_quiet']:
            print(self.full_path)
            return
        for violation in self.violations:
            violation.print_results()


class MakoTemplateLinter(object):
    """
    The linter for Mako template files.
    """

    # Resolves to the module-level tuple of the same name (class bodies fall
    # back to globals for names not yet bound in the class).
    _skip_mako_dirs = _skip_mako_dirs

    _PAGE_H_FILTER_RE = re.compile(r'<%page expression_filter=(?:"h"|\'h\')\s*/>')
    # finds "| n, h}" when given "${x | n, h}"
    _FILTERS_RE = re.compile(r'\|[a-zA-Z_,\s]*\}')
    _CONTEXTS_RE = re.compile(r"""
        <script.*?>|  # script tag start
        </script>|  # script tag end
        <%static:require_module.*?>|  # require js script tag start
        </%static:require_module> # require js script tag end""", re.VERBOSE | re.IGNORECASE)
    _MEDIA_TYPE_RE = re.compile(r"""type=['"].*?['"]""", re.IGNORECASE)
    _JAVASCRIPT_MEDIA_TYPES = (
        'text/javascript',
        'text/ecmascript',
        'application/ecmascript',
        'application/javascript',
    )

    def process_file(self, directory, file_name):
        """
        Process file to determine if it is a Mako template file and
        if it is safe.

        Arguments:
            directory (string): The directory of the file to be checked
            file_name (string): A filename for a potential Mako file

        Returns:
            The file results containing any violations, or None if the file is
            never checked (or has no violations).

        """
        if not self._is_mako_directory(directory):
            return None

        # TODO: When safe-by-default is turned on at the platform level, will we:
        # 1. Turn it on for .html only, or
        # 2. Turn it on for all files, and have different rulesets that have
        #    different rules of .xml, .html, .js, .txt Mako templates (e.g. use
        #    the n filter to turn off h for some of these)?
        # For now, we only check .html and .xml files
        if not file_name.lower().endswith(('.html', '.xml')):
            return None

        return self._load_and_check_mako_file_is_safe(directory + '/' + file_name)

    def _is_mako_directory(self, directory):
        """
        Determines if the provided directory is a directory that could contain
        Mako template files that need to be linted.

        Arguments:
            directory: The directory to be linted.

        Returns:
            True if this directory should be linted for Mako template violations
            and False otherwise.

        """
        if _is_skip_dir(self._skip_mako_dirs, directory):
            return False
        return ('/templates/' in directory) or directory.endswith('/templates')

    def _load_and_check_mako_file_is_safe(self, mako_file_full_path):
        """
        Loads the Mako template file and checks if it is in violation.

        Arguments:
            mako_file_full_path: The file to be loaded and linted.

        Returns:
            The file results containing any violations, or None if none found.

        """
        mako_template = _load_file(self, mako_file_full_path)
        results = FileResults(mako_file_full_path)
        self._check_mako_file_is_safe(mako_template, results)
        if results.violations:
            return results
        return None

    def _check_mako_file_is_safe(self, mako_template, results):
        """
        Checks for violations in a Mako template.

        Arguments:
            mako_template: The contents of the Mako template.
            results: A file results objects to which violations will be added.

        """
        has_page_default = self._has_page_default(mako_template, results)
        if not has_page_default:
            results.violations.append(RuleViolation(Rules.mako_missing_default))
        self._check_mako_expressions(mako_template, has_page_default, results)
        results.prepare_results(mako_template)

    def _has_page_default(self, mako_template, results):
        """
        Checks if the Mako template contains the page expression marking it as
        safe by default.

        Arguments:
            mako_template: The contents of the Mako template.
            results: A list of results into which violations will be added
                (currently unused).

        Returns:
            True if the page directive with the h filter is present, and False
            otherwise.

        """
        return self._PAGE_H_FILTER_RE.search(mako_template) is not None

    def _check_mako_expressions(self, mako_template, has_page_default, results):
        """
        Searches for Mako expressions and then checks if they contain
        violations.

        Arguments:
            mako_template: The contents of the Mako template.
            has_page_default: True if the page is marked as default, False
                otherwise.
            results: A list of results into which violations will be added.

        """
        expressions = self._find_mako_expressions(mako_template)
        contexts = self._get_contexts(mako_template)
        for expression in expressions:
            if expression['expression'] is None:
                results.violations.append(ExpressionRuleViolation(
                    Rules.mako_unparsable_expression, expression
                ))
                continue

            context = self._get_context(contexts, expression['start_index'])
            self._check_filters(mako_template, expression, context, has_page_default, results)

    def _check_filters(self, mako_template, expression, context, has_page_default, results):
        """
        Checks that the filters used in the given Mako expression are valid
        for the given context.

        Arguments:
            mako_template: The contents of the Mako template.
            expression: A dict containing the start_index, end_index, and
                expression (text) of the expression.
            context: The context of the page in which the expression was found
                (e.g. javascript, html).
            has_page_default: True if the page is marked as default, False
                otherwise.
            results: A list of results into which violations will be added.

        """
        filters_match = self._FILTERS_RE.search(expression['expression'])
        if filters_match is None:
            # No filters at all: fine in HTML, never fine in JavaScript.
            if context == 'javascript':
                results.violations.append(ExpressionRuleViolation(
                    Rules.mako_invalid_js_filter, expression
                ))
            return

        # Strip the leading "|" and trailing "}" before splitting.
        filters = filters_match.group()[1:-1].replace(" ", "").split(",")
        violated_rule = None
        if context == 'html':
            if filters == ['h']:
                # Only report the explicit h filter when the page default is
                # set; removing it otherwise would make the template less safe.
                if has_page_default:
                    violated_rule = Rules.mako_unwanted_html_filter
            elif filters == ['n', 'dump_html_escaped_json']:
                # {x | n, dump_html_escaped_json} is valid
                pass
            else:
                violated_rule = Rules.mako_invalid_html_filter
        else:
            if filters == ['n', 'dump_js_escaped_json']:
                # {x | n, dump_js_escaped_json} is valid
                pass
            elif filters == ['n', 'js_escaped_string']:
                # {x | n, js_escaped_string} is valid, if surrounded by quotes
                start_index = expression['start_index']
                next_index = expression['end_index'] + 1
                # Guard both ends: an expression at the very start or end of
                # the template has no neighbouring character.
                prior_character = mako_template[start_index - 1] if start_index > 0 else ''
                next_character = mako_template[next_index] if next_index < len(mako_template) else ''
                has_surrounding_quotes = (
                    prior_character == next_character and prior_character in ('\'', '"')
                )
                if not has_surrounding_quotes:
                    violated_rule = Rules.mako_js_string_missing_quotes
            else:
                violated_rule = Rules.mako_invalid_js_filter

        if violated_rule is not None:
            results.violations.append(ExpressionRuleViolation(violated_rule, expression))

    # NOTE(review): the text from the context regex alternatives through the
    # first lines of _find_balanced_end_curly was damaged in the copy under
    # review (markup-like spans were stripped). It has been reconstructed from
    # the surviving fragments and the callers above -- verify against version
    # control before merging.
    def _get_contexts(self, mako_template):
        """
        Returns a data structure that represents the indices at which the
        template changes from HTML context to JavaScript and back.

        Arguments:
            mako_template: The contents of the Mako template.

        Return:
            A list of dicts where each dict contains the 'index' of the context
            and the context 'type' (e.g. 'html' or 'javascript').
        """
        contexts = [{'index': 0, 'type': 'html'}]
        for context in self._CONTEXTS_RE.finditer(mako_template):
            match_string = context.group().lower()
            if match_string.startswith("<script"):
                context_type = 'javascript'
                match_type = self._MEDIA_TYPE_RE.search(match_string)
                if match_type is not None:
                    # get media type (e.g. get text/javascript from
                    # type="text/javascript")
                    media_type = match_type.group()[6:-1].lower()
                    if media_type not in self._JAVASCRIPT_MEDIA_TYPES:
                        # TODO: What are other types found, and are these
                        # really html?  Or do we need to properly handle
                        # unknown contexts?
                        context_type = 'html'
                contexts.append({'index': context.end(), 'type': context_type})
            elif match_string.startswith("</script"):
                contexts.append({'index': context.start(), 'type': 'html'})
            elif match_string.startswith("<%static:require_module"):
                contexts.append({'index': context.end(), 'type': 'javascript'})
            else:
                contexts.append({'index': context.start(), 'type': 'html'})
        return contexts

    def _get_context(self, contexts, index):
        """
        Gets the context (e.g. javascript, html) of the template at the given
        index.

        Arguments:
            contexts: A list of dicts where each dict contains the 'index' of
                the context and the context 'type' (e.g. 'html' or
                'javascript').
            index: The index for which we want the context.

        Returns:
            The context (e.g. javascript or html) for the given index.

        """
        current_context = contexts[0]['type']
        for context in contexts:
            if context['index'] <= index:
                current_context = context['type']
            else:
                break
        return current_context

    def _find_mako_expressions(self, mako_template):
        """
        Finds all the Mako expressions in a Mako template and creates a list
        of dicts for each expression.

        Arguments:
            mako_template: The content of the Mako template.

        Returns:
            A list of dicts for each expression, where the dict contains the
            following:

                start_index: The index of the start of the expression.
                end_index: The index of the closing curly of the expression,
                    or -1 if unparseable.
                expression: The text of the expression, or None if
                    unparseable.

        """
        start_delim = '${'
        start_index = 0
        expressions = []

        while True:
            start_index = mako_template.find(start_delim, start_index)
            if start_index < 0:
                break
            end_index = self._find_balanced_end_curly(mako_template, start_index + len(start_delim), 0)

            if end_index < 0:
                expression_text = None
            else:
                expression_text = mako_template[start_index:end_index + 1]

            expressions.append({
                'start_index': start_index,
                'end_index': end_index,
                'expression': expression_text,
            })

            # end_index of -1 represents a parsing error and we may find others
            start_index = max(start_index + len(start_delim), end_index)

        return expressions

    def _find_balanced_end_curly(self, mako_template, start_index, num_open_curlies):
        """
        Finds the end index of the Mako expression's ending curly brace.  Skips
        any additional open/closed braces that are balanced inside.  Does not
        take into consideration strings.

        Arguments:
            mako_template: The template text.
            start_index: The start index of the Mako expression.
            num_open_curlies: The current number of open expressions.

        Returns:
            The end index of the expression, or -1 if unparseable.

        """
        end_curly_index = mako_template.find('}', start_index)
        if end_curly_index < 0:
            # if we can't find an end_curly, let's just quit
            return end_curly_index

        open_curly_index = mako_template.find('{', start_index, end_curly_index)

        if (open_curly_index >= 0) and (open_curly_index < end_curly_index):
            if mako_template[open_curly_index - 1] == '$':
                # assume if we find "${" it is the start of the next expression
                # and we have a parse error
                return -1
            else:
                return self._find_balanced_end_curly(mako_template, open_curly_index + 1, num_open_curlies + 1)

        if num_open_curlies == 0:
            return end_curly_index
        else:
            return self._find_balanced_end_curly(mako_template, end_curly_index + 1, num_open_curlies - 1)


class UnderscoreTemplateLinter(object):
    """
    The linter for Underscore.js template files.
    """

    # Resolves to the module-level tuple of the same name, which also skips
    # '/test' directories (class bodies fall back to globals for names not yet
    # bound in the class).
    _skip_underscore_dirs = _skip_underscore_dirs

    _UNESCAPED_EXPRESSION_RE = re.compile(r"<%=.*?%>")

    def process_file(self, directory, file_name):
        """
        Process file to determine if it is an Underscore template file and
        if it is safe.

        Arguments:
            directory (string): The directory of the file to be checked
            file_name (string): A filename for a potential underscore file

        Returns:
            The file results containing any violations, or None if the file is
            never checked (or has no violations).

        """
        if not self._is_underscore_directory(directory):
            return None

        if not file_name.lower().endswith('.underscore'):
            return None

        full_path = directory + '/' + file_name
        return self._load_and_check_underscore_file_is_safe(full_path)

    def print_results(self, options):
        """
        Prints all results (i.e. violations) for all files that failed this
        linter.

        Nothing in this module populates ``_results``; when it is absent this
        prints nothing instead of raising AttributeError.

        Arguments:
            options: A list of the options.

        """
        for result in getattr(self, '_results', ()):
            result.print_results(options)

    def _is_underscore_directory(self, directory):
        """
        Determines if the provided directory is a directory that could contain
        Underscore.js template files that need to be linted.

        Arguments:
            directory: The directory to be linted.

        Returns:
            True if this directory should be linted for Underscore.js template
            violations and False otherwise.

        """
        return not _is_skip_dir(self._skip_underscore_dirs, directory)

    def _load_and_check_underscore_file_is_safe(self, file_full_path):
        """
        Loads the Underscore.js template file and checks if it is in violation.

        Arguments:
            file_full_path: The file to be loaded and linted

        Returns:
            The file results containing any violations, or None if none found.

        """
        underscore_template = _load_file(self, file_full_path)
        results = FileResults(file_full_path)
        self._check_underscore_file_is_safe(underscore_template, results)
        if results.violations:
            return results
        return None

    def _check_underscore_file_is_safe(self, underscore_template, results):
        """
        Checks for violations in an Underscore.js template.

        Arguments:
            underscore_template: The contents of the Underscore.js template.
            results: A file results objects to which violations will be added.

        """
        self._check_underscore_expressions(underscore_template, results)
        results.prepare_results(underscore_template)

    def _check_underscore_expressions(self, underscore_template, results):
        """
        Searches for Underscore.js expressions that contain violations.

        Arguments:
            underscore_template: The contents of the Underscore.js template.
            results: A list of results into which violations will be added.

        """
        results.violations.extend(
            ExpressionRuleViolation(Rules.underscore_not_escaped, expression)
            for expression in self._find_unescaped_expressions(underscore_template)
        )

    def _find_unescaped_expressions(self, underscore_template):
        """
        Returns a list of unsafe expressions.

        At this time all expressions that are unescaped are considered unsafe.

        Arguments:
            underscore_template: The contents of the Underscore.js template.

        Returns:
            A list of dicts for each expression, where the dict contains the
            following:

                start_index: The index of the start of the expression.
                end_index: The index just past the end of the expression.
                expression: The text of the expression.

        """
        return [
            {
                'start_index': match.start(),
                'end_index': match.end(),
                'expression': match.group(),
            }
            for match in self._UNESCAPED_EXPRESSION_RE.finditer(underscore_template)
        ]


def _process_current_walk(current_walk, template_linters, options):
    """
    For each linter, lints all the files in the current os walk. This means
    finding and printing violations.

    Arguments:
        current_walk: A walk returned by os.walk().
        template_linters: A list of linting objects.
        options: A list of the options.

    """
    walk_directory, _, walk_files = current_walk
    for walk_file in walk_files:
        for template_linter in template_linters:
            results = template_linter.process_file(walk_directory, walk_file)
            if results is not None:
                results.print_results(options)


def _process_os_walk(starting_dir, template_linters, options):
    """
    For each linter, lints all the directories in the starting directory.

    Arguments:
        starting_dir: The initial directory to begin the walk.
        template_linters: A list of linting objects.
        options: A list of the options.

    """
    for current_walk in os.walk(starting_dir):
        _process_current_walk(current_walk, template_linters, options)


def main():
    """
    Used to execute the linter. Use --help option for help.

    Prints all of the violations.
    """
    #TODO: Use click
    if '--help' in sys.argv:
        print("Check that templates are safe.")
        print("Options:")
        print(" --quiet Just display the filenames with violations.")
        print("")
        print("Rules:")
        for rule in Rules.__members__.values():
            print(" {0[0]}: {0[1]}".format(rule.value))
        return

    is_quiet = '--quiet' in sys.argv

    options = {
        'is_quiet': is_quiet,
    }

    template_linters = [MakoTemplateLinter(), UnderscoreTemplateLinter()]
    _process_os_walk('.', template_linters, options)


if __name__ == "__main__":
    main()