Skip to content

Functionality to Identify and Assign New Aliases #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
71 changes: 68 additions & 3 deletions src/pango_aliasor/aliasor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#%%
class Aliasor:
def __init__(self, alias_file=None):
import json
Expand All @@ -24,17 +23,29 @@ def __init__(self, alias_file=None):

self.realias_dict = {v: k for k, v in self.alias_dict.items()}

def compress(self, name):
def compress(self, name, assign=False):
    """
    Return the compressed (aliased) form of a dot-separated lineage name.

    Names with fewer than five dot-separated parts are returned unchanged.
    Otherwise the longest leading prefix made of 3*k + 1 parts that still
    leaves at least one trailing part is replaced by its entry in
    self.realias_dict, and the remaining parts are appended after a dot.

    If assign is true and that prefix has no entry yet, self.assign_alias
    is called for it first. For example, compressing 'BA.5.2.5.6' when
    'BA.5.2.5' has no alias registers a new alias for 'BA.5.2.5' (for
    instance EN, depending on which aliases are already taken) and
    returns that alias followed by '.6'.

    With assign left false, an unknown prefix makes the realias_dict
    lookup fail (KeyError for a plain dict).
    """
    parts = name.split(".")
    # Number of alias hops: one per three levels beyond the first level.
    hops = (len(parts) - 2) // 3
    if hops <= 0:
        return name
    cut = 3 * hops + 1
    prefix = ".".join(parts[:cut])
    suffix = ".".join(parts[cut:])
    if assign and prefix not in self.realias_dict:
        # assign_alias is called with its default arguments; per the original
        # author's note this never yields an X-prefixed alias, since
        # recombinant aliases are handled separately.
        self.assign_alias(prefix)
    return self.realias_dict[prefix] + "." + suffix

def uncompress(self, name):
"""
Returns the uncompressed lineage name.
"""
name_split = name.split(".")
letter = name_split[0]
try:
Expand Down Expand Up @@ -89,6 +100,60 @@ def partial_compress(self, name, up_to: int = 0, accepted_aliases: set = {}):
if name_split[(3 * up_to + 1) :] == []:
return alias
return alias + "." + ".".join(name_split[(3 * up_to + 1) :])

@staticmethod
def _charToB(char):
    """
    Return the offset of a single character from 'A' (code point 65),
    so 'A' gives 0 and 'Z' gives 25.
    """
    return ord(char) - ord("A")

@staticmethod
def _bToChar(n, banned='IOX'):
    """
    Return the character n places after 'A', moving forward one character
    at a time while the result is contained in banned.
    With the default banned set, 8 ('I') therefore gives 'J'.
    """
    code_point = n + 65
    while chr(code_point) in banned:
        code_point += 1
    return chr(code_point)

@staticmethod
def _numberToString(n, b=26, banned='IOX'):
    """
    Convert a non-negative integer to an alias string.

    The number is written in base b, most significant digit first, and each
    digit is turned into a character with Aliasor._bToChar, which moves past
    any character contained in banned.

    Always returns a string: zero yields the single character for digit 0
    (previously a list, [0], was returned for zero).
    Raises ValueError for negative n, which would otherwise never terminate
    because floor division by a positive base never reaches zero.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    # Collect digits least significant first; always emit at least one digit
    # so that n == 0 goes through the same character mapping as other values.
    digits = []
    while True:
        n, digit = divmod(n, b)
        digits.append(int(digit))
        if not n:
            break
    return "".join(Aliasor._bToChar(d, banned) for d in reversed(digits))

@staticmethod
def _stringToNumber(cstr, b=26):
    """
    Read cstr as a base-b number, most significant character first.
    Each character's digit value is taken from Aliasor._charToB.
    An empty cstr gives 0.
    """
    values = list(map(Aliasor._charToB, cstr))
    # The last character has weight b**0, the one before it b**1, and so on.
    return sum(
        value * b**power for power, value in enumerate(reversed(values))
    )

# %%
def next_available_alias(self, recombinant=False):
    """
    Return the alias string that follows the highest-numbered existing alias.

    Keys of self.alias_dict starting with 'X' are treated as recombinant
    aliases and are numbered separately (without their leading 'X').
    Pass recombinant=True to get the next 'X'-prefixed alias instead of
    the next regular one.

    max() is applied to the matching keys, so it raises ValueError when
    self.alias_dict holds no key of the requested kind.
    """
    if not recombinant:
        taken = [
            Aliasor._stringToNumber(key)
            for key in self.alias_dict
            if key[0] != 'X'
        ]
        return Aliasor._numberToString(max(taken) + 1)
    taken = [
        Aliasor._stringToNumber(key[1:])
        for key in self.alias_dict
        if key[0] == 'X'
    ]
    return 'X' + Aliasor._numberToString(max(taken) + 1)

def assign_alias(self, name, recombinant=False):
    """
    Register name under the alias returned by self.next_available_alias.

    The new pair is stored in both directions: alias -> name in
    self.alias_dict and name -> alias in self.realias_dict.
    Pass recombinant=True to request a recombinant alias.
    Returns None.
    """
    fresh_alias = self.next_available_alias(recombinant)
    # Keep the forward and reverse lookup tables in step.
    self.alias_dict[fresh_alias] = name
    self.realias_dict[name] = fresh_alias
21 changes: 20 additions & 1 deletion tests/test_aliasor.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,23 @@ def test_parent():
assert aliasor.parent('') == ''
assert aliasor.parent('A') == ''
assert aliasor.parent('B') == ''
assert aliasor.parent('C.1') == 'B.1.1.1'
assert aliasor.parent('C.1') == 'B.1.1.1'

def test_next_compression():
    """next_available_alias proposes an unused regular alias and an X-prefixed recombinant alias."""
    aliasor = Aliasor()

    # The regular proposal must not collide with an existing alias.
    regular = aliasor.next_available_alias()
    assert regular not in aliasor.alias_dict.keys()

    # The recombinant proposal starts with 'X' and differs from the regular one.
    recombinant = aliasor.next_available_alias(True)
    assert recombinant[0] == 'X'
    assert regular != recombinant

def test_assign_compression():
    """assign_alias stores the name under the alias that next_available_alias announced beforehand."""
    aliasor = Aliasor()

    # Regular alias: both lookup directions must agree with the announced alias.
    expected = aliasor.next_available_alias()
    aliasor.assign_alias('test')
    assert aliasor.realias_dict['test'] == expected
    assert aliasor.alias_dict[expected] == 'test'

    # Recombinant alias: same round trip with recombinant=True.
    expected_recombinant = aliasor.next_available_alias(True)
    aliasor.assign_alias('test2', True)
    assert aliasor.realias_dict['test2'] == expected_recombinant
    assert aliasor.alias_dict[expected_recombinant] == 'test2'