Add stable_bucketer helper tool (#21783)
Add stable_bucketer helper tool
This commit is contained in:
@@ -1,6 +1,11 @@
|
||||
"""
|
||||
An implementation of a stable bucketing algorithm that can be used
|
||||
to reliably group users into experiments.
|
||||
|
||||
An implementation of this is available as a standalone command-line
|
||||
tool, `scripts/stable_bucketer`, which can both validate the
|
||||
bucketing of a username and generate recognizable usernames for
|
||||
particular experiment buckets for testing.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
|
||||
154
scripts/stable_bucketer
Executable file
154
scripts/stable_bucketer
Executable file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
#####
|
||||
|
||||
|
||||
### Main ###
|
||||
def main(args, env):
|
||||
epilog = "Checks username bucketing for experiments and generates names for each experiment bucket. Derived names include the base user name, experiment abbreviation, bucket number, and a short random string, separated with hyphens. (v1.0)"
|
||||
parser = argparse.ArgumentParser(epilog=epilog)
|
||||
parser.add_argument(
|
||||
"exp",
|
||||
metavar="EXPERIMENT",
|
||||
help="Experiment to bucket for.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"user",
|
||||
nargs="?",
|
||||
default=env.get("USER", ""),
|
||||
metavar="NAME",
|
||||
help="Base user name for bucketing, default is $USER.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a", "--abbrev",
|
||||
metavar="EXP",
|
||||
help="Experiment abbreviation for name generation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-b", "--buckets",
|
||||
nargs="+",
|
||||
type=int,
|
||||
metavar="X",
|
||||
help="Buckets to make names for, default is all buckets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c", "--check-only",
|
||||
action="store_true",
|
||||
help="Just check what bucket the user is in, don't generate names.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n", "--number",
|
||||
type=int,
|
||||
default=2,
|
||||
metavar="N",
|
||||
help="Number of buckets, default is 2.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--print-args",
|
||||
action="store_true",
|
||||
# help="Print arguments and computations, then exit.",
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
my_args = parser.parse_args(sys.argv[1:])
|
||||
bucket_number = my_args.number
|
||||
|
||||
hashed = hash_exp(my_args.exp, my_args.user)
|
||||
digest = bucket_int(hashed)
|
||||
bucket = digest % bucket_number
|
||||
|
||||
abbrev = my_args.abbrev
|
||||
if abbrev is None:
|
||||
abbrev = abbreviate(my_args.exp)
|
||||
|
||||
print("{user} is in bucket: {bucket}".format(user=my_args.user, bucket=bucket))
|
||||
if my_args.print_args:
|
||||
print("* Args:\n\t{my_args}\n* Computed:\n\tdigest: {digest} - hash: {hashed} - {abbrev}".format(**vars()))
|
||||
return 0
|
||||
|
||||
if my_args.check_only:
|
||||
return 0
|
||||
|
||||
bucket_list = my_args.buckets
|
||||
if not bucket_list:
|
||||
bucket_list = range(bucket_number)
|
||||
|
||||
# TODO: validate more of the arguments
|
||||
# HACK: currently not enforcing the naming rules:
|
||||
# - Username must be between 2 and 30 characters long.
|
||||
# - Usernames can only contain letters (A-Z, a-z), numerals (0-9), underscores (_), and hyphens (-).)
|
||||
print("Generated names:")
|
||||
for i in bucket_list:
|
||||
if i >= bucket_number:
|
||||
print(" (Skipped {i}, experiment only has {bucket_number} buckets)".format(**vars()))
|
||||
continue
|
||||
print(" " + name_for(i, abbrev, my_args.exp, my_args.user, bucket_number))
|
||||
|
||||
return 0
|
||||
#####
|
||||
|
||||
|
||||
### Helpers ###
|
||||
def hash_exp(exp, name):
|
||||
hasher = hashlib.md5()
|
||||
hasher.update(exp.encode("utf-8"))
|
||||
hasher.update(name.encode("utf-8"))
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def bucket_int(hashed):
|
||||
s = re.sub("[0-7]", "0", hashed)
|
||||
s = re.sub("[8-9a-f]", "1", s)
|
||||
return int(s, 2)
|
||||
|
||||
|
||||
def name_for(bucket, abbrev, exp, name, number):
|
||||
if abbrev:
|
||||
abbrev += "-"
|
||||
name_base = "{name}-{abbrev}{bucket}-".format(**vars())
|
||||
tries = 100 * number
|
||||
for _ in range(tries):
|
||||
s = "".join([random.choice(string.digits + string.ascii_lowercase) for _ in range(5)])
|
||||
n = name_base + s
|
||||
b = bucket_int(hash_exp(exp, n)) % number
|
||||
if bucket == b:
|
||||
return n
|
||||
else:
|
||||
raise RuntimeError("Failed to generate a name for bucket {bucket} in {tries} tries".format(**vars()))
|
||||
|
||||
|
||||
def abbreviate(exp):
|
||||
"Deterministically creates a ~3-6 letter abbreviation, using initials and trying to stay as recognizable as possible"
|
||||
s = re.sub(r"[^0-9A-Za-z]+", "-", exp) # drop symbols that aren't allowed in usernames (_ and - are allowed; this collapses them into -, simplifying some of the following)
|
||||
if len(re.findall(r"(^|[-])\w", s)) >= 3:
|
||||
# found at least a few word separators, use initials
|
||||
s = re.sub(r"(^|[-])(\w)[^-]*", r"\2", s).lower()
|
||||
s = re.sub(r"[-]", "", s) # drop stray separators
|
||||
elif len(re.findall(r"[A-Z][^A-Z]+", s)) >= 3:
|
||||
# found at least a few capitalizations, use as initials, strip lowercase and junk
|
||||
s = re.sub(r"[a-z-]", "", s).lower()
|
||||
else:
|
||||
s = re.sub(r"[-]", "", s).lower() # drop junk
|
||||
if len(s) > 6:
|
||||
# drop vowels except first & last and let the shortener trim it down from there
|
||||
s = s[0] + re.sub(r"[aeiou]", "", s[1:-1]) + s[-1]
|
||||
|
||||
if len(s) > 6:
|
||||
# shorten abbreviation, keeping the beginning, middle, and last characters to preserve recognizability
|
||||
half = (len(s) - 1) // 2 # -1 to bias toward early-middle letters
|
||||
s = s[:2] + s[half - 1:half + 2] + s[-1:]
|
||||
return s
|
||||
#####
|
||||
|
||||
|
||||
#####
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
import sys
|
||||
xit = main(sys.argv, os.environ)
|
||||
sys.exit(xit)
|
||||
#####
|
||||
Reference in New Issue
Block a user