commit f1b8432917813c4e02bb27001f7ffae89cb9df6b
parent 9ea58332f416ac0a8994b3b2718e5264a7d120a0
Author: Amin Mesbah <mesbah.amin@gmail.com>
Date:   Wed,  7 Sep 2016 20:14:09 -0700
Remove all unnecessary functionality. Use py.test.
Diffstat:
| A | Makefile | | | 21 | +++++++++++++++++++++ | 
| A | dev_requirements.txt | | | 6 | ++++++ | 
| A | elemental_speller.py | | | 98 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | 
| D | speller.py | | | 131 | ------------------------------------------------------------------------------- | 
| M | tests.py | | | 78 | ++++++++++++++++++++---------------------------------------------------------- | 
5 files changed, 145 insertions(+), 189 deletions(-)
diff --git a/Makefile b/Makefile
@@ -0,0 +1,21 @@
+DATADIR = data/
+DATE = `date +%Y-%m-%d`
+
+init:
+	pip install -r dev_requirements.txt
+
+test:
+	# To run a single test, use "py.test tests.py -k test_name"
+	py.test tests.py
+
+lint:
+	flake8 *.py
+
+watch-log:
+	tail -f debug.log
+
+loc:
+	cloc --by-file --include-lang=Python .
+
+todo:
+	grep -FR --ignore-case --binary-file=without-match todo *.py
diff --git a/dev_requirements.txt b/dev_requirements.txt
@@ -0,0 +1,6 @@
+flake8==3.0.4
+mccabe==0.5.2
+py==1.4.31
+pycodestyle==2.0.0
+pyflakes==1.2.3
+pytest==3.0.2
diff --git a/elemental_speller.py b/elemental_speller.py
@@ -0,0 +1,98 @@
+# TODO: add logging
+
+from collections import namedtuple
+from itertools import chain, product
+
+ELEMENTS = (
+    'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
+    'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
+    'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr',
+    'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir',
+    'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N',
+    'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb',
+    'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
+    'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta',
+    'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'Uub', 'Uuh', 'Uuo',
+    'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
+)
+
+
+def elemental_spelling(word, symbols=ELEMENTS):
+    """Return every way to spell word using only the given symbols.
+
+    Matching ignores case. Each spelling is a tuple of capitalized
+    tokens; spellings appear in the order _groupings yields them.
+
+    Example:
+    >>> elemental_spelling('amputation')
+    [('Am', 'Pu', 'Ta', 'Ti', 'O', 'N'), ('Am', 'P', 'U', 'Ta', 'Ti', 'O', 'N')]
+    """
+    # Lower-case the symbols once, not once per candidate spelling.
+    known_symbols = {symbol.lower() for symbol in symbols}
+
+    candidates = (_map_word(word, g) for g in _groupings(len(word)))
+
+    return [
+        tuple(token.capitalize() for token in candidate)
+        for candidate in candidates
+        # keep a candidate only when all of its tokens are known symbols
+        if {token.lower() for token in candidate} <= known_symbols
+    ]
+
+
+def _groupings(word_length, token_sizes=(1, 2, 3)):
+    """Return a tuple of every character grouping for a word of the
+    given length.
+
+    A grouping is a tuple of token sizes that add up to word_length;
+    it describes one way to cut a word into consecutive tokens. The
+    grouping (1, 3, 2), for instance, cuts 'canary' into
+    ('c', 'ana', 'ry').
+
+    token_sizes lists the sizes a token may have; by default a token
+    is one, two, or three characters long.
+
+    Groupings with fewer tokens come first; groupings with the same
+    number of tokens keep the order itertools.product generates.
+
+    Example:
+    >>> _groupings(4, token_sizes=(1, 2))
+    ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
+    """
+
+    # every arrangement of allowed sizes, from 1 token to word_length tokens
+    candidates = chain.from_iterable(
+        product(token_sizes, repeat=token_count)
+        for token_count in range(1, word_length + 1)
+    )
+
+    # keep only the arrangements that cover exactly word_length characters
+    return tuple(
+        candidate for candidate in candidates
+        if sum(candidate) == word_length
+    )
+
+
+def _map_word(word, grouping):
+    """Split word into consecutive tokens whose sizes follow grouping.
+
+    The n-th token takes the next grouping[n] characters of word; the
+    tokens are returned as a tuple, in order.
+
+    Example:
+    >>> _map_word('because', (1, 2, 1, 1, 2))
+    ('b', 'ec', 'a', 'u', 'se')
+    """
+    remaining = iter(word)
+    # List comprehensions (not generator expressions) are used so that
+    # running out of characters still surfaces as a plain StopIteration.
+    tokens = [
+        "".join([next(remaining) for _ in range(size)])
+        for size in grouping
+    ]
+    return tuple(tokens)
+
+
+if __name__ == '__main__':
+    demo_word = 'Mockery'  # sample input for a quick manual run
+    print('{}:\n{}'.format(demo_word, elemental_spelling(demo_word)))
diff --git a/speller.py b/speller.py
@@ -1,131 +0,0 @@
-# TODO:
-# - eliminate unnecessary functions
-# - simplify
-# - use consistent terminology
-import csv
-from collections import namedtuple
-from itertools import chain, product
-from pprint import pprint
-
-ELEMENTS = (
-    'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
-    'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
-    'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr',
-    'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir',
-    'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N',
-    'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb',
-    'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
-    'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta',
-    'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'Uub', 'Uuh', 'Uuo',
-    'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
-)
-
-
-def get_csv_data(file_name, column, header=True):
-    """Return in a list all data from a given column of a .csv file"""
-
-    data = []
-
-    with open(file_name) as infile:
-        csv_reader = csv.reader(infile, skipinitialspace=True, delimiter=',')
-        if header:
-            next(csv_reader, None)  # skip header row
-        for row in csv_reader:
-            data.append(row[column])
-
-    return data
-
-
-def tokenize_sequence(sequence):
-    """Return a list each of all single and double character tokens."""
-
-    Tokens = namedtuple('Tokens', (['singles', 'doubles']))
-
-    singles = tuple(sequence[i] for i in range(0, len(sequence)))
-    doubles = tuple(sequence[i:i+2] for i in range(0, len(sequence) - 1))
-
-    return Tokens(singles, doubles)
-
-
-def find_matches(sequence, symbols):
-    """Return a dictionary of symbols and indices for all
-    symbols that match an item in the given sequence.
-    """
-
-    return {
-        symbol: index
-        for character in sequence
-        for index, symbol in enumerate(symbols)
-        if symbol.lower() == character.lower()
-    }
-
-
-def groupings(word, token_sizes=(1, 2)):
-    """Return a tuple of all permutations of possible character
-    grouping arrangements of a word.
-
-    token_sizes defines the possible sizes of character groups,
-    and by default allows only singles and pairs.
-    """
-
-    cartesian_products = (
-        product(token_sizes, repeat=r)
-        for r in range(1, len(word) + 1)
-    )
-
-    # include only groupings that represent the correct number of chars
-    groupings = tuple(
-        grouping
-        for grouping in chain.from_iterable(cartesian_products)
-        if sum(grouping) == len(word)
-    )
-
-    return groupings
-
-
-def map_word(word, grouping):
-    """Given a word and a grouping, map the characters of the word
-    to match the distribution defined in the grouping.
-
-    example:
-    >>> map_word('because', (1, 2, 1, 1, 2))
-    ['b', 'ec', 'a', 'u', 'se']
-    """
-
-    word_chars = (c for c in word)
-
-    mapped = []
-    for char_group_size in grouping:
-        char_group = ""
-        for _ in range(char_group_size):
-            char_group += next(word_chars)
-        mapped.append(char_group)
-
-    return tuple(mapped)
-
-
-if __name__ == '__main__':
-    symbols = get_csv_data('elements.csv', 1)
-
-    test_word = 'Osiris'
-
-    tokens = tokenize_sequence(test_word)
-
-    single_matches = find_matches(tokens.singles, symbols)
-    pair_matches = find_matches(tokens.doubles, symbols)
-
-    letter_groupings = groupings(test_word)
-
-    spellings = [map_word(test_word, g) for g in letter_groupings]
-
-    elemental_spellings = [
-        [l.capitalize() for l in spelling]
-        for spelling in spellings
-        if set(c.lower() for c in spelling) <= set(s.lower() for s in symbols)
-    ]
-
-    pprint(tokens)
-    pprint(single_matches)
-    pprint(pair_matches)
-    pprint(list(zip(letter_groupings, spellings)))
-    pprint(elemental_spellings)
diff --git a/tests.py b/tests.py
@@ -1,7 +1,5 @@
-import speller
-import unittest
-
-# TODO: change to py.test syntax
+import elemental_speller as es
+import pytest
 
 ELEMENTS = (
     'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
@@ -16,66 +14,30 @@ ELEMENTS = (
     'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
 )
 
-class MatchingTest(unittest.TestCase):
-    test_singles = ['B', 'e', 'c', 'a', 'u', 's', 'e']
-    test_pairs = ['Be', 'ec', 'ca', 'au', 'se']
-
-    def test_match_singles(self):
-        matches = speller.find_matches(self.test_singles, ELEMENTS)
-        self.assertEqual(
-            matches,
-            {'S': 86, 'B': 8, 'U': 103, 'C': 15}
-        )
-
-    def test_match_pairs(self):
-        matches = speller.find_matches(self.test_pairs, ELEMENTS)
-        self.assertEqual(
-            matches,
-            {'Au': 7, 'Be': 10, 'Ca': 16, 'Se': 89}
-        )
-
-
-class TokensTest(unittest.TestCase):
-    test_word = "Osiris"
-
-    def test_single_chars(self):
-        tokens = speller.tokenize_sequence(self.test_word)
-        self.assertEqual(tokens.singles, ("O", "s", "i", "r", "i", "s"))
-
-    def test_pair_chars(self):
-        tokens = speller.tokenize_sequence(self.test_word)
-        self.assertEqual(tokens.doubles, ("Os", "si", "ir", "ri", "is"))
 
+def test_verify_data():
+    assert es.ELEMENTS == ELEMENTS
 
-class GroupingTest(unittest.TestCase):
-    word = "that"
 
-    def test_singles_and_pairs(self):
-        expected_maps = ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
-        group_maps = speller.groupings(self.word)
-        self.assertEqual(group_maps, expected_maps)
+def test_groupings():
+    assert es._groupings(4, token_sizes=()) == ()
 
+    expected = ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
+    assert es._groupings(4, token_sizes=(1, 2)) == expected
 
-class FileTest(unittest.TestCase):
-    file_name = "elements.csv"
+    expected = (
+        (1, 3), (2, 2), (3, 1), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1)
+    )
+    assert es._groupings(4, token_sizes=(1, 2, 3)) == expected
 
-    proper_data = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
-                   'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca',
-                   'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
-                   'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr',
-                   'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn',
-                   'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
-                   'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb',
-                   'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
-                   'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Th',
-                   'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm',
-                   'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds',
-                   'Rg', 'Uub', 'Uut', 'Uuq', 'Uup', 'Uuh', 'Uus', 'Uuo']
 
-    def test_file_contains_proper_data(self):
-        data = speller.get_csv_data(self.file_name, 1)
-        self.assertEqual(data, self.proper_data)
+def test_map_word():
+    assert es._map_word('because', (1, 2, 1, 1, 2)) == ('b', 'ec', 'a', 'u', 'se')
+    assert es._map_word('osiris', (1, 3, 2)) == ('o', 'sir', 'is')
 
 
-if __name__ == '__main__':
-    unittest.main(warnings='ignore')
+def test_elemental_spelling():
+    assert es.elemental_spelling('amputation') == [
+        ('Am', 'Pu', 'Ta', 'Ti', 'O', 'N'),
+        ('Am', 'P', 'U', 'Ta', 'Ti', 'O', 'N')
+    ]