commit b5231a079f27c9cf4061314b67be0b48a38a517a
parent a0f9ebae95c7485f207ff67d3603d90133f0fd33
Author: amin <dev@aminmesbah.com>
Date: Wed, 7 Sep 2016 23:31:13 +0000
Add function to map a word to a grouping scheme. Simplify control flow.
FossilOrigin-Name: 0ede8311bbcec66575422d99bb203bd6a322e729876c6cc38085cba7b87f5d09
Diffstat:
M | speller.py | | | 142 | ++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------- |
M | tests.py | | | 37 | ++++++++++++++++++++++++++++--------- |
2 files changed, 121 insertions(+), 58 deletions(-)
diff --git a/speller.py b/speller.py
@@ -1,21 +1,35 @@
# TODO:
-# test that all letters in word are present in some element
-# generate group_maps only for the exact number of chars in word
-#
-
+# - eliminate unnecessary functions
+# - simplify
+# - use consistent terminology
+import csv
from collections import namedtuple
from itertools import chain, product
-import csv
-import sys
-
-
-def get_csv_data(file_name, column):
+from pprint import pprint
+
+ELEMENTS = (
+ 'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
+ 'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
+ 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr',
+ 'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir',
+ 'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N',
+ 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb',
+ 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
+ 'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta',
+ 'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'Uub', 'Uuh', 'Uuo',
+ 'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
+)
+
+
+def get_csv_data(file_name, column, header=True):
"""Return in a list all data from a given column of a .csv file"""
+
data = []
with open(file_name) as infile:
csv_reader = csv.reader(infile, skipinitialspace=True, delimiter=',')
- next(csv_reader, None)
+ if header:
+ next(csv_reader, None) # skip header row
for row in csv_reader:
data.append(row[column])
@@ -24,64 +38,94 @@ def get_csv_data(file_name, column):
def tokenize_sequence(sequence):
"""Return a list each of all single and double character tokens."""
- t = namedtuple('Tokens', (['single', 'pair']))
- single = [sequence[i:i+1] for i in range(0, len(sequence))]
- pair = [sequence[i:i+2] for i in range(0, len(sequence) - 1)]
- tokens = t(single, pair)
+ Tokens = namedtuple('Tokens', (['singles', 'doubles']))
+
+ singles = tuple(sequence[i] for i in range(0, len(sequence)))
+ doubles = tuple(sequence[i:i+2] for i in range(0, len(sequence) - 1))
- return tokens
+ return Tokens(singles, doubles)
def find_matches(sequence, symbols):
- """Return a list of all element symbols matching
- an item in the given sequence.
+ """Return a dictionary of symbols and indices for all
+ symbols that match an item in the given sequence.
"""
- matches = []
- indices = []
- lower_symbols = [i.lower() for i in symbols]
- lower_sequence = [i.lower() for i in sequence]
- for i in lower_sequence:
- matches += (x for x in lower_symbols if x == i)
- indices += (lower_symbols.index(x) for x in lower_symbols if x == i)
+ return {
+ symbol: index
+ for character in sequence
+ for index, symbol in enumerate(symbols)
+ if symbol.lower() == character.lower()
+ }
- return matches
+def groupings(word, token_sizes=(1, 2)):
+ """Return a tuple of all permutations of possible character
+ grouping arrangements of a word.
-def groupings(word, group_sizes = [1,2]):
- """Return a list of all permutations of possible character grouping
- arrangements of a word. group_sizes defines the possible sizes of
- character groups, and by default allows only singles and pairs.
+ token_sizes defines the possible sizes of character groups,
+ and by default allows only singles and pairs.
"""
- group_maps = []
- length = len(word)
- cartesian_product = (product(group_sizes, repeat=r)
- for r in range(1, length + 1))
- products = chain.from_iterable(cartesian_product)
- # include only products that represent the correct number of chars
- for p in products:
- if sum(p) == length:
- p = [tuple(x for x in p)]
- for x in p:
- if x not in group_maps:
- group_maps.append(x)
+ cartesian_products = (
+ product(token_sizes, repeat=r)
+ for r in range(1, len(word) + 1)
+ )
+
+ # include only groupings that represent the correct number of chars
+ groupings = tuple(
+ grouping
+ for grouping in chain.from_iterable(cartesian_products)
+ if sum(grouping) == len(word)
+ )
- return group_maps
+ return groupings
-def main():
+def map_word(word, grouping):
+ """Given a word and a grouping, map the characters of the word
+ to match the distribution defined in the grouping.
+
+ example:
+ >>> map_word('because', (1, 2, 1, 1, 2))
+ ('b', 'ec', 'a', 'u', 'se')
+ """
+
+ word_chars = (c for c in word)
+
+ mapped = []
+ for char_group_size in grouping:
+ char_group = ""
+ for _ in range(char_group_size):
+ char_group += next(word_chars)
+ mapped.append(char_group)
+
+ return tuple(mapped)
+
+
+if __name__ == '__main__':
symbols = get_csv_data('elements.csv', 1)
- test_word = "Because"
+ test_word = 'Osiris'
tokens = tokenize_sequence(test_word)
- single_matches = find_matches(tokens.single, symbols)
- pair_matches = find_matches(tokens.pair, symbols)
- print(single_matches, pair_matches)
+ single_matches = find_matches(tokens.singles, symbols)
+ pair_matches = find_matches(tokens.doubles, symbols)
+ letter_groupings = groupings(test_word)
-if __name__ == '__main__':
- main()
+ spellings = [map_word(test_word, g) for g in letter_groupings]
+
+ elemental_spellings = [
+ [l.capitalize() for l in spelling]
+ for spelling in spellings
+ if set(c.lower() for c in spelling) <= set(s.lower() for s in symbols)
+ ]
+
+ pprint(tokens)
+ pprint(single_matches)
+ pprint(pair_matches)
+ pprint(list(zip(letter_groupings, spellings)))
+ pprint(elemental_spellings)
diff --git a/tests.py b/tests.py
@@ -1,20 +1,38 @@
import speller
import unittest
+# TODO: change to py.test syntax
+
+ELEMENTS = (
+ 'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
+ 'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
+ 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr',
+ 'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir',
+ 'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N',
+ 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb',
+ 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
+ 'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta',
+ 'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'Uub', 'Uuh', 'Uuo',
+ 'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
+)
class MatchingTest(unittest.TestCase):
test_singles = ['B', 'e', 'c', 'a', 'u', 's', 'e']
test_pairs = ['Be', 'ec', 'ca', 'au', 'se']
- #TODO: change this so it never fails
- symbols = speller.get_csv_data('elements.csv', 1)
def test_match_singles(self):
- matches = speller.find_matches(self.test_singles, self.symbols)
- self.assertEqual(matches, ['b', 'c', 'u', 's'])
+ matches = speller.find_matches(self.test_singles, ELEMENTS)
+ self.assertEqual(
+ matches,
+ {'S': 86, 'B': 8, 'U': 103, 'C': 15}
+ )
def test_match_pairs(self):
- matches = speller.find_matches(self.test_pairs, self.symbols)
- self.assertEqual(matches, ['be', 'ca', 'au', 'se'])
+ matches = speller.find_matches(self.test_pairs, ELEMENTS)
+ self.assertEqual(
+ matches,
+ {'Au': 7, 'Be': 10, 'Ca': 16, 'Se': 89}
+ )
class TokensTest(unittest.TestCase):
@@ -22,24 +40,25 @@ class TokensTest(unittest.TestCase):
def test_single_chars(self):
tokens = speller.tokenize_sequence(self.test_word)
- self.assertEqual(tokens.single, ["O", "s", "i", "r", "i", "s"])
+ self.assertEqual(tokens.singles, ("O", "s", "i", "r", "i", "s"))
def test_pair_chars(self):
tokens = speller.tokenize_sequence(self.test_word)
- self.assertEqual(tokens.pair, ["Os", "si", "ir", "ri", "is"])
+ self.assertEqual(tokens.doubles, ("Os", "si", "ir", "ri", "is"))
class GroupingTest(unittest.TestCase):
word = "that"
def test_singles_and_pairs(self):
- expected_maps = [(2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1,1,1,1)]
+ expected_maps = ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
group_maps = speller.groupings(self.word)
self.assertEqual(group_maps, expected_maps)
class FileTest(unittest.TestCase):
file_name = "elements.csv"
+
proper_data = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca',
'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',