commit 72a9c67c17da70ce7e15c3f1e75b6753bddad512
parent b5231a079f27c9cf4061314b67be0b48a38a517a
Author: amin <dev@aminmesbah.com>
Date: Thu, 8 Sep 2016 03:14:08 +0000
Remove all unnecessary functionality. Use py.test.
FossilOrigin-Name: c67d46c068b7619f31297d387b0bf65af828c936f7373fc389828e8d81de5909
Diffstat:
A | Makefile | | | 21 | +++++++++++++++++++++ |
A | dev_requirements.txt | | | 6 | ++++++ |
A | elemental_speller.py | | | 98 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
D | speller.py | | | 131 | ------------------------------------------------------------------------------- |
M | tests.py | | | 78 | ++++++++++++++++++++---------------------------------------------------------- |
5 files changed, 145 insertions(+), 189 deletions(-)
diff --git a/Makefile b/Makefile
@@ -0,0 +1,21 @@
+DATADIR = data/
+DATE = `date +%Y-%m-%d`
+
+init:
+ pip install -r dev_requirements.txt
+
+test:
+ # To run individual tests, use "py.test -k the_test_path"
+ py.test tests.py
+
+lint:
+ flake8 *.py
+
+watch-log:
+ tail -f debug.log
+
+loc:
+ cloc --by-file --include-lang=Python .
+
+todo:
+ grep -FR --ignore-case --binary-file=without-match todo *.py
diff --git a/dev_requirements.txt b/dev_requirements.txt
@@ -0,0 +1,6 @@
+flake8==3.0.4
+mccabe==0.5.2
+py==1.4.31
+pycodestyle==2.0.0
+pyflakes==1.2.3
+pytest==3.0.2
diff --git a/elemental_speller.py b/elemental_speller.py
@@ -0,0 +1,98 @@
+# TODO: add logging
+
+from collections import namedtuple
+from itertools import chain, product
+
+ELEMENTS = (
+ 'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
+ 'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
+ 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr',
+ 'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir',
+ 'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N',
+ 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb',
+ 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
+ 'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta',
+ 'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'Uub', 'Uuh', 'Uuo',
+ 'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
+)
+
+
+def elemental_spelling(word, symbols=ELEMENTS):
+ """Given a word and a sequence of symbols (tokens),
+ return a list of any possible ways to spell that word
+ with those symbols.
+
+ Example:
+ >>> elemental_spelling('amputation')
+    [('Am', 'Pu', 'Ta', 'Ti', 'O', 'N'), ('Am', 'P', 'U', 'Ta', 'Ti', 'O', 'N')]
+ """
+ letter_groupings = _groupings(len(word))
+
+ spellings = [_map_word(word, grouping) for grouping in letter_groupings]
+
+ elemental_spellings = [
+ tuple(token.capitalize() for token in spelling)
+ for spelling in spellings
+ # set operation: set of chars in spelling is subset of set of symbols
+ if set(s.lower() for s in spelling) <= set(s.lower() for s in symbols)
+ ]
+
+ return elemental_spellings
+
+
+def _groupings(word_length, token_sizes=(1, 2, 3)):
+ """Return a tuple of all character groupings for a word
+ of a given length.
+
+ A character grouping is a tuple representing the distribution
+ of characters in a tokenized word.
+
+ The word 'canary', if mapped to the grouping (1, 3, 2), would
+ be broken down into ['c', 'ana', 'ry'].
+
+ token_sizes defines the possible sizes of character groups,
+ and by default allows only singles, pairs, and triplets.
+
+ Example:
+ >>> _groupings(4, token_sizes=(1, 2))
+ ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
+ """
+
+ cartesian_products = (
+ product(token_sizes, repeat=r)
+ for r in range(1, word_length + 1)
+ )
+
+ # include only groupings that represent the correct number of chars
+ groupings = tuple(
+ grouping
+ for grouping in chain.from_iterable(cartesian_products)
+ if sum(grouping) == word_length
+ )
+
+ return groupings
+
+
+def _map_word(word, grouping):
+ """Return a tuple of tokens: word mapped to a grouping.
+
+ Example:
+ >>> _map_word('because', (1, 2, 1, 1, 2))
+ ('b', 'ec', 'a', 'u', 'se')
+ """
+
+ word_chars = (c for c in word)
+
+ mapped = []
+ for char_group_size in grouping:
+ char_group = ""
+ for _ in range(char_group_size):
+ char_group += next(word_chars)
+ mapped.append(char_group)
+
+ return tuple(mapped)
+
+
+if __name__ == '__main__':
+ test_word = 'Mockery'
+ print('{}:\n{}'.format(test_word, elemental_spelling(test_word)))
diff --git a/speller.py b/speller.py
@@ -1,131 +0,0 @@
-# TODO:
-# - eliminate unnecessary functions
-# - simplify
-# - use consistent terminology
-import csv
-from collections import namedtuple
-from itertools import chain, product
-from pprint import pprint
-
-ELEMENTS = (
- 'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
- 'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
- 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr',
- 'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir',
- 'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N',
- 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb',
- 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
- 'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta',
- 'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'Uub', 'Uuh', 'Uuo',
- 'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
-)
-
-
-def get_csv_data(file_name, column, header=True):
- """Return in a list all data from a given column of a .csv file"""
-
- data = []
-
- with open(file_name) as infile:
- csv_reader = csv.reader(infile, skipinitialspace=True, delimiter=',')
- if header:
- next(csv_reader, None) # skip header row
- for row in csv_reader:
- data.append(row[column])
-
- return data
-
-
-def tokenize_sequence(sequence):
- """Return a list each of all single and double character tokens."""
-
- Tokens = namedtuple('Tokens', (['singles', 'doubles']))
-
- singles = tuple(sequence[i] for i in range(0, len(sequence)))
- doubles = tuple(sequence[i:i+2] for i in range(0, len(sequence) - 1))
-
- return Tokens(singles, doubles)
-
-
-def find_matches(sequence, symbols):
- """Return a dictionary of symbols and indices for all
- symbols that match an item in the given sequence.
- """
-
- return {
- symbol: index
- for character in sequence
- for index, symbol in enumerate(symbols)
- if symbol.lower() == character.lower()
- }
-
-
-def groupings(word, token_sizes=(1, 2)):
- """Return a tuple of all permutations of possible character
- grouping arrangements of a word.
-
- token_sizes defines the possible sizes of character groups,
- and by default allows only singles and pairs.
- """
-
- cartesian_products = (
- product(token_sizes, repeat=r)
- for r in range(1, len(word) + 1)
- )
-
- # include only groupings that represent the correct number of chars
- groupings = tuple(
- grouping
- for grouping in chain.from_iterable(cartesian_products)
- if sum(grouping) == len(word)
- )
-
- return groupings
-
-
-def map_word(word, grouping):
- """Given a word and a grouping, map the characters of the word
- to match the distribution defined in the grouping.
-
- example:
- >>> map_word('because', (1, 2, 1, 1, 2))
- ['b', 'ec', 'a', 'u', 'se']
- """
-
- word_chars = (c for c in word)
-
- mapped = []
- for char_group_size in grouping:
- char_group = ""
- for _ in range(char_group_size):
- char_group += next(word_chars)
- mapped.append(char_group)
-
- return tuple(mapped)
-
-
-if __name__ == '__main__':
- symbols = get_csv_data('elements.csv', 1)
-
- test_word = 'Osiris'
-
- tokens = tokenize_sequence(test_word)
-
- single_matches = find_matches(tokens.singles, symbols)
- pair_matches = find_matches(tokens.doubles, symbols)
-
- letter_groupings = groupings(test_word)
-
- spellings = [map_word(test_word, g) for g in letter_groupings]
-
- elemental_spellings = [
- [l.capitalize() for l in spelling]
- for spelling in spellings
- if set(c.lower() for c in spelling) <= set(s.lower() for s in symbols)
- ]
-
- pprint(tokens)
- pprint(single_matches)
- pprint(pair_matches)
- pprint(list(zip(letter_groupings, spellings)))
- pprint(elemental_spellings)
diff --git a/tests.py b/tests.py
@@ -1,7 +1,5 @@
-import speller
-import unittest
-
-# TODO: change to py.test syntax
+import elemental_speller as es
+import pytest
ELEMENTS = (
'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
@@ -16,66 +14,30 @@ ELEMENTS = (
'Uup', 'Uuq', 'Uus', 'Uut', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'
)
-class MatchingTest(unittest.TestCase):
- test_singles = ['B', 'e', 'c', 'a', 'u', 's', 'e']
- test_pairs = ['Be', 'ec', 'ca', 'au', 'se']
-
- def test_match_singles(self):
- matches = speller.find_matches(self.test_singles, ELEMENTS)
- self.assertEqual(
- matches,
- {'S': 86, 'B': 8, 'U': 103, 'C': 15}
- )
-
- def test_match_pairs(self):
- matches = speller.find_matches(self.test_pairs, ELEMENTS)
- self.assertEqual(
- matches,
- {'Au': 7, 'Be': 10, 'Ca': 16, 'Se': 89}
- )
-
-
-class TokensTest(unittest.TestCase):
- test_word = "Osiris"
-
- def test_single_chars(self):
- tokens = speller.tokenize_sequence(self.test_word)
- self.assertEqual(tokens.singles, ("O", "s", "i", "r", "i", "s"))
-
- def test_pair_chars(self):
- tokens = speller.tokenize_sequence(self.test_word)
- self.assertEqual(tokens.doubles, ("Os", "si", "ir", "ri", "is"))
+def test_verify_data():
+ assert es.ELEMENTS == ELEMENTS
-class GroupingTest(unittest.TestCase):
- word = "that"
- def test_singles_and_pairs(self):
- expected_maps = ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
- group_maps = speller.groupings(self.word)
- self.assertEqual(group_maps, expected_maps)
+def test_groupings():
+ assert es._groupings(4, token_sizes=()) == ()
+ expected = ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
+ assert es._groupings(4, token_sizes=(1, 2)) == expected
-class FileTest(unittest.TestCase):
- file_name = "elements.csv"
+ expected = (
+ (1, 3), (2, 2), (3, 1), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1)
+ )
+ assert es._groupings(4, token_sizes=(1, 2, 3)) == expected
- proper_data = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
- 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca',
- 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
- 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr',
- 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn',
- 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
- 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb',
- 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
- 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Th',
- 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm',
- 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds',
- 'Rg', 'Uub', 'Uut', 'Uuq', 'Uup', 'Uuh', 'Uus', 'Uuo']
- def test_file_contains_proper_data(self):
- data = speller.get_csv_data(self.file_name, 1)
- self.assertEqual(data, self.proper_data)
+def test_map_word():
+ assert es._map_word('because', (1, 2, 1, 1, 2)) == ('b', 'ec', 'a', 'u', 'se')
+ assert es._map_word('osiris', (1, 3, 2)) == ('o', 'sir', 'is')
-if __name__ == '__main__':
- unittest.main(warnings='ignore')
+def test_elemental_spelling():
+ assert es.elemental_spelling('amputation') == [
+ ('Am', 'Pu', 'Ta', 'Ti', 'O', 'N'),
+ ('Am', 'P', 'U', 'Ta', 'Ti', 'O', 'N')
+ ]