#!/usr/bin/env python3
"""Check experiment bucketing for a user name and generate per-bucket names."""
import argparse
import hashlib
import os
import random
import re
import string
import sys

# Alphabet and length of the random suffix appended to generated names.
_SUFFIX_ALPHABET = string.digits + string.ascii_lowercase
_SUFFIX_LENGTH = 5


#####
### Main ###
def main(args, env):
    """Run the command-line tool.

    Args:
        args: Full argument vector with the program name first (the same
            layout as ``sys.argv``); everything after the first item is parsed.
        env: Mapping of environment variables; ``USER`` supplies the default
            base user name.

    Returns:
        Exit status, 0 on success.

    Raises:
        SystemExit: Raised by argparse on invalid arguments (status 2) or
            when ``--help`` is requested.
        RuntimeError: If no name landing in a requested bucket is found
            within the retry budget of ``name_for``.
    """
    epilog = "Checks username bucketing for experiments and generates names for each experiment bucket. Derived names include the base user name, experiment abbreviation, bucket number, and a short random string, separated with hyphens. (v1.0)"
    parser = argparse.ArgumentParser(epilog=epilog)
    parser.add_argument(
        "exp",
        metavar="EXPERIMENT",
        help="Experiment to bucket for.",
    )
    parser.add_argument(
        "user",
        nargs="?",
        default=env.get("USER", ""),
        metavar="NAME",
        help="Base user name for bucketing, default is $USER.",
    )
    parser.add_argument(
        "-a",
        "--abbrev",
        metavar="EXP",
        help="Experiment abbreviation for name generation.",
    )
    parser.add_argument(
        "-b",
        "--buckets",
        nargs="+",
        type=int,
        metavar="X",
        help="Buckets to make names for, default is all buckets.",
    )
    parser.add_argument(
        "-c",
        "--check-only",
        action="store_true",
        help="Just check what bucket the user is in, don't generate names.",
    )
    parser.add_argument(
        "-n",
        "--number",
        type=int,
        default=2,
        metavar="N",
        help="Number of buckets, default is 2.",
    )
    # Hidden debugging flag: prints arguments and computations, then exits.
    parser.add_argument(
        "--print-args",
        action="store_true",
        help=argparse.SUPPRESS,
    )
    # Parse the vector we were handed rather than the process-global
    # sys.argv, so main() also works when called from other code.
    my_args = parser.parse_args(args[1:])

    bucket_number = my_args.number
    if bucket_number < 1:
        # A bucket count of zero would divide by zero below, and a negative
        # count has no meaningful buckets.
        parser.error("number of buckets must be at least 1")

    hashed = hash_exp(my_args.exp, my_args.user)
    digest = bucket_int(hashed)
    bucket = digest % bucket_number

    abbrev = my_args.abbrev
    if abbrev is None:
        abbrev = abbreviate(my_args.exp)

    print(
        "For experiment {exp}, {user} is in bucket: {bucket}".format(
            exp=my_args.exp,
            user=my_args.user,
            bucket=bucket,
        )
    )

    if my_args.print_args:
        print(
            "* Args:\n\t{my_args}\n* Computed:\n\tdigest: {digest} - hash: {hashed} - {abbrev}".format(
                my_args=my_args,
                digest=digest,
                hashed=hashed,
                abbrev=abbrev,
            )
        )
        return 0

    if my_args.check_only:
        return 0

    bucket_list = my_args.buckets
    if not bucket_list:
        bucket_list = range(bucket_number)

    # TODO: validate more of the arguments
    # HACK: currently not enforcing the naming rules:
    # - Username must be between 2 and 30 characters long.
    # - Usernames can only contain letters (A-Z, a-z), numerals (0-9),
    #   underscores (_), and hyphens (-).
    print("Generated names:")
    for i in bucket_list:
        # Buckets outside 0..N-1 can never be produced by the modulo, so
        # skip them instead of exhausting every retry in name_for().
        if not 0 <= i < bucket_number:
            print(
                " (Skipped {i}, experiment only has {bucket_number} buckets)".format(
                    i=i,
                    bucket_number=bucket_number,
                )
            )
            continue
        print(" " + name_for(i, abbrev, my_args.exp, my_args.user, bucket_number))

    return 0


#####
### Helpers ###
def hash_exp(exp, name):
    """Return the MD5 hex digest of the experiment name followed by the user name.

    MD5 is used purely as a bucketing hash here, not for security.
    """
    hasher = hashlib.md5()
    hasher.update(exp.encode("utf-8"))
    hasher.update(name.encode("utf-8"))
    return hasher.hexdigest()


def bucket_int(hashed):
    """Convert a lowercase hex digest into the integer used for bucketing.

    Each hex digit becomes one binary digit: 0-7 map to 0 and 8-f map to 1.
    The resulting string is then parsed as a base-2 number.
    """
    s = re.sub("[0-7]", "0", hashed)
    s = re.sub("[8-9a-f]", "1", s)
    return int(s, 2)


def name_for(bucket, abbrev, exp, name, number):
    """Generate a derived user name that hashes into ``bucket``.

    Candidates have the form ``{name}-{abbrev}-{bucket}-{random}`` (the
    abbreviation part is omitted when ``abbrev`` is empty) and are retried
    with fresh random suffixes until one lands in the requested bucket.

    Args:
        bucket: Target bucket index.
        abbrev: Experiment abbreviation, may be empty.
        exp: Experiment name used for hashing.
        name: Base user name.
        number: Total number of buckets.

    Returns:
        The first generated name whose bucket equals ``bucket``.

    Raises:
        RuntimeError: If no candidate matched within ``100 * number`` tries.
    """
    abbrev_part = abbrev + "-" if abbrev else abbrev
    name_base = "{name}-{abbrev}{bucket}-".format(
        name=name,
        abbrev=abbrev_part,
        bucket=bucket,
    )
    tries = 100 * number
    for _ in range(tries):
        suffix = "".join(
            [random.choice(_SUFFIX_ALPHABET) for _ in range(_SUFFIX_LENGTH)]
        )
        candidate = name_base + suffix
        if bucket_int(hash_exp(exp, candidate)) % number == bucket:
            return candidate
    raise RuntimeError(
        "Failed to generate a name for bucket {bucket} in {tries} tries".format(
            bucket=bucket,
            tries=tries,
        )
    )


def abbreviate(exp):
    """Deterministically create a ~3-6 letter abbreviation of ``exp``.

    Uses initials where word boundaries can be found and otherwise trims the
    name, trying to stay as recognizable as possible.
    """
    # Drop symbols that aren't allowed in usernames (_ and - are allowed;
    # this collapses them into -, simplifying some of the following).
    s = re.sub(r"[^0-9A-Za-z]+", "-", exp)
    if len(re.findall(r"(^|[-])\w", s)) >= 3:
        # Found at least a few word separators, use initials.
        s = re.sub(r"(^|[-])(\w)[^-]*", r"\2", s).lower()
        s = re.sub(r"[-]", "", s)  # drop stray separators
    elif len(re.findall(r"[A-Z][^A-Z]+", s)) >= 3:
        # Found at least a few capitalizations, use as initials, strip
        # lowercase and junk.
        s = re.sub(r"[a-z-]", "", s).lower()
    else:
        s = re.sub(r"[-]", "", s).lower()  # drop junk
    if len(s) > 6:
        # Drop vowels except first & last and let the shortener trim it down
        # from there.
        s = s[0] + re.sub(r"[aeiou]", "", s[1:-1]) + s[-1]
    if len(s) > 6:
        # Shorten abbreviation, keeping the beginning, middle, and last
        # characters to preserve recognizability.
        half = (len(s) - 1) // 2  # -1 to bias toward early-middle letters
        s = s[:2] + s[half - 1:half + 2] + s[-1:]
    return s


#####

#####
if __name__ == "__main__":
    xit = main(sys.argv, os.environ)
    sys.exit(xit)
#####