commit b5231a079f27c9cf4061314b67be0b48a38a517a
parent a0f9ebae95c7485f207ff67d3603d90133f0fd33
Author: amin <dev@aminmesbah.com>
Date:   Wed,  7 Sep 2016 23:31:13 +0000
Add function to map a word to a grouping scheme. Simplify control flow.
FossilOrigin-Name: 0ede8311bbcec66575422d99bb203bd6a322e729876c6cc38085cba7b87f5d09
Diffstat:
| M | speller.py |  |  | 142 | ++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------- | 
| M | tests.py |  |  | 37 | ++++++++++++++++++++++++++++--------- | 
2 files changed, 121 insertions(+), 58 deletions(-)
diff --git a/speller.py b/speller.py
@@ -1,21 +1,35 @@
 # TODO:
-# test that all letters in word are present in some element
-# generate group_maps only for the exact number of chars in word
-#  
-
+# - eliminate unnecessary functions
+# - simplify
+# - use consistent terminology
+import csv
 from collections import namedtuple
 from itertools import chain, product
-import csv
-import sys
-
-
-def get_csv_data(file_name, column):
+from pprint import pprint
+
+ELEMENTS = (
+    'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
+    'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
+    'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr',
+    'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir',
+    'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N',
+    'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb',
+    'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
+    'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta',
+    'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'Uub', 'Uuh', 'Uuo',
+    'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
+)
+
+
+def get_csv_data(file_name, column, header=True):
     """Return in a list all data from a given column of a .csv file"""
+
     data = []
 
     with open(file_name) as infile:
         csv_reader = csv.reader(infile, skipinitialspace=True, delimiter=',')
-        next(csv_reader, None)
+        if header:
+            next(csv_reader, None)  # skip header row
         for row in csv_reader:
             data.append(row[column])
 
@@ -24,64 +38,94 @@ def get_csv_data(file_name, column):
 
 def tokenize_sequence(sequence):
     """Return a list each of all single and double character tokens."""
-    t = namedtuple('Tokens', (['single', 'pair']))
 
-    single = [sequence[i:i+1] for i in range(0, len(sequence))]
-    pair = [sequence[i:i+2] for i in range(0, len(sequence) - 1)]
-    tokens = t(single, pair)
+    Tokens = namedtuple('Tokens', (['singles', 'doubles']))
+
+    singles = tuple(sequence[i] for i in range(0, len(sequence)))
+    doubles = tuple(sequence[i:i+2] for i in range(0, len(sequence) - 1))
 
-    return tokens
+    return Tokens(singles, doubles)
 
 
 def find_matches(sequence, symbols):
-    """Return a list of all element symbols matching
-    an item in the given sequence.
+    """Return a dictionary of symbols and indices for all
+    symbols that match an item in the given sequence.
     """
-    matches = []
-    indices = []
-    lower_symbols = [i.lower() for i in symbols]
-    lower_sequence = [i.lower() for i in sequence]
 
-    for i in lower_sequence:
-        matches += (x for x in lower_symbols if x == i)
-        indices += (lower_symbols.index(x) for x in lower_symbols if x == i)
+    return {
+        symbol: index
+        for character in sequence
+        for index, symbol in enumerate(symbols)
+        if symbol.lower() == character.lower()
+    }
 
-    return matches
 
+def groupings(word, token_sizes=(1, 2)):
+    """Return a tuple of every way to split a word into consecutive
+    character groups, each expressed as a tuple of group sizes.
 
-def groupings(word, group_sizes = [1,2]):
-    """Return a list of all permutations of possible character grouping
-    arrangements of a word. group_sizes defines the possible sizes of 
-    character groups, and by default allows only singles and pairs.
+    token_sizes defines the possible sizes of character groups,
+    and by default allows only singles and pairs.
     """
-    group_maps = []
-    length = len(word)
-    cartesian_product = (product(group_sizes, repeat=r)
-                         for r in range(1, length + 1))
-    products = chain.from_iterable(cartesian_product)
 
-    # include only products that represent the correct number of chars
-    for p in products:
-        if sum(p) == length:
-            p = [tuple(x for x in p)]
-            for x in p:
-                if x not in group_maps:
-                    group_maps.append(x)
+    cartesian_products = (
+        product(token_sizes, repeat=r)
+        for r in range(1, len(word) + 1)
+    )
+
+    # include only groupings that represent the correct number of chars
+    groupings = tuple(
+        grouping
+        for grouping in chain.from_iterable(cartesian_products)
+        if sum(grouping) == len(word)
+    )
 
-    return group_maps 
+    return groupings
 
 
-def main():
+def map_word(word, grouping):
+    """Given a word and a grouping, map the characters of the word
+    to match the distribution defined in the grouping.
+
+    example:
+    >>> map_word('because', (1, 2, 1, 1, 2))
+    ('b', 'ec', 'a', 'u', 'se')
+    """
+
+    word_chars = (c for c in word)
+
+    mapped = []
+    for char_group_size in grouping:
+        char_group = ""
+        for _ in range(char_group_size):
+            char_group += next(word_chars)
+        mapped.append(char_group)
+
+    return tuple(mapped)
+
+
+if __name__ == '__main__':
     symbols = get_csv_data('elements.csv', 1)
 
-    test_word = "Because"
+    test_word = 'Osiris'
 
     tokens = tokenize_sequence(test_word)
-    single_matches = find_matches(tokens.single, symbols)
-    pair_matches = find_matches(tokens.pair, symbols)
 
-    print(single_matches, pair_matches)
+    single_matches = find_matches(tokens.singles, symbols)
+    pair_matches = find_matches(tokens.doubles, symbols)
 
+    letter_groupings = groupings(test_word)
 
-if __name__ == '__main__':
-    main()
+    spellings = [map_word(test_word, g) for g in letter_groupings]
+
+    elemental_spellings = [
+        [l.capitalize() for l in spelling]
+        for spelling in spellings
+        if set(c.lower() for c in spelling) <= set(s.lower() for s in symbols)
+    ]
+
+    pprint(tokens)
+    pprint(single_matches)
+    pprint(pair_matches)
+    pprint(list(zip(letter_groupings, spellings)))
+    pprint(elemental_spellings)
diff --git a/tests.py b/tests.py
@@ -1,20 +1,38 @@
 import speller
 import unittest
 
+# TODO: change to py.test syntax
+
+ELEMENTS = (
+    'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
+    'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
+    'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr',
+    'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir',
+    'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N',
+    'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb',
+    'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
+    'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta',
+    'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'Uub', 'Uuh', 'Uuo',
+    'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
+)
 
 class MatchingTest(unittest.TestCase):
     test_singles = ['B', 'e', 'c', 'a', 'u', 's', 'e']
     test_pairs = ['Be', 'ec', 'ca', 'au', 'se']
-    #TODO: change this so it never fails
-    symbols = speller.get_csv_data('elements.csv', 1)
 
     def test_match_singles(self):
-        matches = speller.find_matches(self.test_singles, self.symbols)
-        self.assertEqual(matches, ['b', 'c', 'u', 's'])
+        matches = speller.find_matches(self.test_singles, ELEMENTS)
+        self.assertEqual(
+            matches,
+            {'S': 86, 'B': 8, 'U': 103, 'C': 15}
+        )
 
     def test_match_pairs(self):
-        matches = speller.find_matches(self.test_pairs, self.symbols)
-        self.assertEqual(matches, ['be', 'ca', 'au', 'se'])
+        matches = speller.find_matches(self.test_pairs, ELEMENTS)
+        self.assertEqual(
+            matches,
+            {'Au': 7, 'Be': 10, 'Ca': 16, 'Se': 89}
+        )
 
 
 class TokensTest(unittest.TestCase):
@@ -22,24 +40,25 @@ class TokensTest(unittest.TestCase):
 
     def test_single_chars(self):
         tokens = speller.tokenize_sequence(self.test_word)
-        self.assertEqual(tokens.single, ["O", "s", "i", "r", "i", "s"])
+        self.assertEqual(tokens.singles, ("O", "s", "i", "r", "i", "s"))
 
     def test_pair_chars(self):
         tokens = speller.tokenize_sequence(self.test_word)
-        self.assertEqual(tokens.pair, ["Os", "si", "ir", "ri", "is"])
+        self.assertEqual(tokens.doubles, ("Os", "si", "ir", "ri", "is"))
 
 
 class GroupingTest(unittest.TestCase):
     word = "that"
 
     def test_singles_and_pairs(self):
-        expected_maps = [(2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1,1,1,1)]
+        expected_maps = ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
         group_maps = speller.groupings(self.word)
         self.assertEqual(group_maps, expected_maps)
 
 
 class FileTest(unittest.TestCase):
     file_name = "elements.csv"
+
     proper_data = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
                    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca',
                    'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',