161 lines
5.0 KiB
Python
Executable File
161 lines
5.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import hashlib
|
|
import random
|
|
import re
|
|
import string
|
|
#####
|
|
|
|
|
|
### Main ###
|
|
def _build_parser(default_user):
    """Build the argument parser; ``default_user`` is the fallback for the NAME positional."""
    epilog = "Checks username bucketing for experiments and generates names for each experiment bucket. Derived names include the base user name, experiment abbreviation, bucket number, and a short random string, separated with hyphens. (v1.0)"
    parser = argparse.ArgumentParser(epilog=epilog)
    parser.add_argument(
        "exp",
        metavar="EXPERIMENT",
        help="Experiment to bucket for.",
    )
    parser.add_argument(
        "user",
        nargs="?",
        default=default_user,
        metavar="NAME",
        help="Base user name for bucketing, default is $USER.",
    )
    parser.add_argument(
        "-a", "--abbrev",
        metavar="EXP",
        help="Experiment abbreviation for name generation.",
    )
    parser.add_argument(
        "-b", "--buckets",
        nargs="+",
        type=int,
        metavar="X",
        help="Buckets to make names for, default is all buckets.",
    )
    parser.add_argument(
        "-c", "--check-only",
        action="store_true",
        help="Just check what bucket the user is in, don't generate names.",
    )
    parser.add_argument(
        "-n", "--number",
        type=int,
        default=2,
        metavar="N",
        help="Number of buckets, default is 2.",
    )
    parser.add_argument(
        "--print-args",
        action="store_true",
        # Hidden debugging flag: prints arguments and computations, then exits.
        help=argparse.SUPPRESS,
    )
    return parser


def main(args, env):
    """Report which bucket a user falls into and print generated names per bucket.

    Args:
        args: argv-style sequence; ``args[0]`` is the program name and is
            skipped, the remainder is parsed as command-line arguments.
        env: Mapping of environment variables; ``USER`` supplies the default
            base user name (empty string when absent).

    Returns:
        0 on success.

    Raises:
        SystemExit: raised by argparse for ``--help`` or invalid arguments,
            including a bucket count below 1.
    """
    parser = _build_parser(env.get("USER", ""))
    # Parse the argv that was passed in rather than the sys.argv global, so the
    # function also works when called from an importing module.
    my_args = parser.parse_args(args[1:])
    bucket_number = my_args.number
    if bucket_number < 1:
        # Zero would divide by zero below; negatives give meaningless buckets.
        parser.error("number of buckets must be at least 1, got {0}".format(bucket_number))

    hashed = hash_exp(my_args.exp, my_args.user)
    digest = bucket_int(hashed)
    bucket = digest % bucket_number

    abbrev = my_args.abbrev
    if abbrev is None:
        abbrev = abbreviate(my_args.exp)

    print(
        "For experiment {exp}, {user} is in bucket: {bucket}".format(
            exp=my_args.exp,
            user=my_args.user,
            bucket=bucket,
        )
    )
    if my_args.print_args:
        print(
            "* Args:\n\t{my_args}\n* Computed:\n\tdigest: {digest} - hash: {hashed} - {abbrev}".format(
                my_args=my_args,
                digest=digest,
                hashed=hashed,
                abbrev=abbrev,
            )
        )
        return 0

    if my_args.check_only:
        return 0

    bucket_list = my_args.buckets
    if not bucket_list:
        bucket_list = range(bucket_number)

    # TODO: validate more of the arguments
    # HACK: currently not enforcing the naming rules:
    # - Username must be between 2 and 30 characters long.
    # - Usernames can only contain letters (A-Z, a-z), numerals (0-9), underscores (_), and hyphens (-).
    print("Generated names:")
    for i in bucket_list:
        # Negative indexes are skipped too: no name can ever hash into them.
        if not 0 <= i < bucket_number:
            print(" (Skipped {i}, experiment only has {bucket_number} buckets)".format(i=i, bucket_number=bucket_number))
            continue
        print(" " + name_for(i, abbrev, my_args.exp, my_args.user, bucket_number))

    return 0
|
|
#####
|
|
|
|
|
|
### Helpers ###
|
|
def hash_exp(exp, name):
    """Return the hex MD5 digest of ``exp`` followed by ``name``.

    Both strings are UTF-8 encoded and hashed back to back, so only the
    concatenated text matters, not where the split between them falls.
    """
    payload = exp.encode("utf-8") + name.encode("utf-8")
    return hashlib.md5(payload).hexdigest()
|
|
|
|
|
|
def bucket_int(hashed):
    """Collapse a lowercase hex string into an integer, one bit per hex digit.

    Digits 0-7 become a 0 bit and digits 8-f become a 1 bit; the resulting
    bit string is parsed as a base-2 integer. Any other character is left in
    place and will make the base-2 parse fail with ValueError.
    """
    to_bits = str.maketrans("0123456789abcdef", "0000000011111111")
    return int(hashed.translate(to_bits), 2)
|
|
|
|
|
|
def name_for(bucket, abbrev, exp, name, number):
    """Generate a derived user name that hashes into ``bucket`` for ``exp``.

    Candidates have the form ``<name>-<abbrev>-<bucket>-<suffix>``, where the
    suffix is five random characters drawn from digits and lowercase letters.
    Candidates are tried until one lands in the requested bucket.

    Args:
        bucket: Target bucket index.
        abbrev: Experiment abbreviation; when empty or None the abbreviation
            segment is left out of the name entirely.
        exp: Experiment name, hashed together with each candidate.
        name: Base user name the candidate is derived from.
        number: Total number of buckets for the experiment.

    Returns:
        The first candidate name whose bucket equals ``bucket``.

    Raises:
        RuntimeError: if no candidate matches within ``100 * number`` tries.
    """
    # Treat None like an empty abbreviation so it never shows up as the
    # literal text "None" inside the generated name.
    abbrev_part = "{0}-".format(abbrev) if abbrev else ""
    name_base = "{name}-{abbrev}{bucket}-".format(name=name, abbrev=abbrev_part, bucket=bucket)
    alphabet = string.digits + string.ascii_lowercase
    tries = 100 * number
    for _ in range(tries):
        suffix = "".join([random.choice(alphabet) for _ in range(5)])
        candidate = name_base + suffix
        if bucket_int(hash_exp(exp, candidate)) % number == bucket:
            return candidate
    raise RuntimeError("Failed to generate a name for bucket {bucket} in {tries} tries".format(bucket=bucket, tries=tries))
|
|
|
|
|
|
def abbreviate(exp):
    """Deterministically create a short (at most 6 character) lowercase abbreviation.

    Uses initials when the name has enough word separators or capitalized
    words, otherwise the name itself with inner vowels removed, and finally
    trims anything still longer than six characters.
    """
    # Anything not allowed in a username (and runs of it) collapses to a single
    # hyphen, which then serves as the only word separator below.
    cleaned = re.sub(r"[^0-9A-Za-z]+", "-", exp)

    if len(re.findall(r"(^|[-])\w", cleaned)) > 2:
        # At least three separated words: keep the first character of each.
        initials = re.sub(r"(^|[-])(\w)[^-]*", r"\2", cleaned).lower()
        short = re.sub(r"[-]", "", initials)  # drop stray separators
    elif len(re.findall(r"[A-Z][^A-Z]+", cleaned)) > 2:
        # At least three capitalized runs: strip lowercase letters and hyphens.
        short = re.sub(r"[a-z-]", "", cleaned).lower()
    else:
        short = re.sub(r"[-]", "", cleaned).lower()
        # NOTE(review): vowel dropping is assumed to belong to this branch only;
        # confirm against the upstream file's indentation.
        if len(short) > 6:
            # Keep the first and last characters, drop vowels in between.
            inner = re.sub(r"[aeiou]", "", short[1:-1])
            short = short[0] + inner + short[-1]

    if len(short) <= 6:
        return short

    # Keep the first two, three around the middle, and the last character;
    # the -1 biases the middle window toward earlier letters.
    mid = (len(short) - 1) // 2
    return short[:2] + short[mid - 1:mid + 2] + short[-1:]
|
|
#####
|
|
|
|
|
|
#####
|
|
if __name__ == "__main__":
    # Script entry point: hand the process argv and environment to main()
    # and use its return value as the process exit status.
    import os
    import sys

    sys.exit(main(sys.argv, os.environ))
|
|
#####
|