stoichiograph

Spell words with elemental symbols from the periodic table.
git clone git://git.amin.space/stoichiograph.git
Log | Files | Refs | LICENSE

commit 0890827b23182eda96881b5e2401c9650fb9d2cd
parent a5d52a852a0ff16cf915ca449e995e16ce111da0
Author: amin <dev@aminmesbah.com>
Date:   Wed, 22 Feb 2017 07:40:36 +0000

Replace old spelling method with graph method.

The new graph code is immensely more efficient!

FossilOrigin-Name: 855d613c68ca66fd8865d9f881eede326adef28f26528d257ed0e450a394e6c4
Diffstat:
D dag.py | 126 -------------------------------------------------------------------------------
M speller.py | 172 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
M stoichiograph.py | 9 ++-------
M tests.py | 26 ++------------------------
4 files changed, 105 insertions(+), 228 deletions(-)

diff --git a/dag.py b/dag.py @@ -1,126 +0,0 @@ -from collections import defaultdict, namedtuple - -ELEMENTS = { - 'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', - 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', - 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', - 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', - 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', - 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', - 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', - 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', - 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', - 'Mc', 'Lv', 'Ts', 'Og' -} - -# A single node of the graph. -Node = namedtuple('Node', ['value', 'position']) - - -class Graph(): - """A directed acyclic graph that stores all possible - elemental spellings of a word. - """ - def __init__(self): - self._parents_of = defaultdict(set) - self._children_of = defaultdict(set) - - def firsts(self): - """Return nodes with no parents.""" - return self._children_of[None] - - def lasts(self): - """Return nodes with no children.""" - return self._parents_of[None] - - def add_edge(self, parent, child): - """Add a parent-child to child relatonship to the graph. - None is ok as a key, but not a value. - """ - if parent is not None: - self._parents_of[child].add(parent) - if child is not None: - self._children_of[parent].add(child) - - def edges(self): - """Return a list of all parent-child relationships.""" - return [ - (parent, child) - for parent in self._children_of - for child in self._children_of[parent] - ] - - def export(self): - """Print a string to stdout that can be interpreted by - Graphviz. 
- """ - print('digraph G {') - for (parent, child) in self.edges(): - print('\t{} -> {}'.format(parent.value, child.value)) - print('}') - - -def find_all_paths(graph, start, end, path=[]): - """Return a list of all paths through the graph from start - to end. - Based on https://www.python.org/doc/essays/graphs/ - """ - path = path + [start] - if start == end: - return [path] - if start not in graph.keys(): - return [] - paths = [] - for node in graph[start]: - if node not in path: - newpaths = find_all_paths(graph, node, end, path) - for newpath in newpaths: - paths.append(tuple(newpath)) - return paths - - -def build_graph(word, graph, symbols=ELEMENTS): - """Given a word and a graph, recursively find all single and - double-character tokens in the word and add them to the graph. - """ - - def segments(word, position=0, previous_root=None): - if word == '': - graph.add_edge(previous_root, None) - return - - single_root = Node(word[0], position) - if single_root.value.capitalize() in symbols: - graph.add_edge(previous_root, single_root) - - if word not in processed: - single_stem = word[1:] - segments(single_stem, position + 1, previous_root=single_root) - - if len(word) >= 2: - double_root = Node(word[0:2], position) - if double_root.value.capitalize() in symbols: - graph.add_edge(previous_root, double_root) - - if word not in processed: - double_stem = word[2:] - segments(double_stem, position + 2, previous_root=double_root) - processed.add(word) - - processed = set() - segments(word) - - -if __name__ == '__main__': - from pprint import pprint - w = 'inconspicuous' - g = Graph() - build_graph(w, g) - - spellings = list() - for first in g._children_of[None]: - for last in g._parents_of[None]: - for path in find_all_paths(g._children_of, first, last): - spellings.append(tuple(node.value for node in path)) - - pprint(spellings) diff --git a/speller.py b/speller.py @@ -1,17 +1,13 @@ +from collections import defaultdict, namedtuple from functools import lru_cache 
-from itertools import chain, product import logging -import dag -# TODO(amin): Profile and optimize -# TODO(amin): Add performance reporting to log -# TODO(amin): Use recursion to save time with long words that can't be spelled. # TODO(amin): Convert symbol tuple to element name or atomic number tuple log = logging.getLogger(__name__) log.addHandler(logging.NullHandler()) -ELEMENTS = ( +ELEMENTS = { 'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', @@ -22,13 +18,13 @@ ELEMENTS = ( 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og' -) +} # TODO(amin): Use optional caching/memoization to improve performance # TODO(amin): Support appostrophies # TODO(amin): Add option to require no repeated symbols -def spell(word, use_graph=False, symbols=ELEMENTS): +def spell(word, symbols=ELEMENTS): """Return a list of any possible ways to spell a word with a given set of symbols. 
@@ -38,88 +34,122 @@ def spell(word, use_graph=False, symbols=ELEMENTS): """ log.info('Word: {}'.format(word)) - if use_graph: - log.debug('Using graph speller') - g = dag.Graph() - dag.build_graph(word, g) + log.debug('Using graph speller') + g = Graph() + build_graph(word, g) - spellings = list() - for first in g.firsts(): - for last in g.lasts(): - for path in dag.find_all_paths(g._children_of, first, last): - spellings.append(tuple(node.value for node in path)) + spellings = list() + for first in g.firsts(): + for last in g.lasts(): + for path in find_all_paths(g._children_of, first, last): + spellings.append(tuple(node.value for node in path)) - else: - groupings = generate_groupings(len(word)) - spellings = [map_word(word, grouping) for grouping in groupings] - - elemental_spellings = [ + elemental_spellings = sorted([ tuple(token.capitalize() for token in spelling) for spelling in spellings - # set operation: set of chars in spelling is subset of set of symbols - if set(s.lower() for s in spelling) <= set(s.lower() for s in symbols) - ] + ], reverse=True) log.info('Spellings: {}'.format(elemental_spellings)) return elemental_spellings -@lru_cache(maxsize=None) -def generate_groupings(word_length, batch_sizes=(1, 2)): - """Return all groupings for a word of a given length. +# A single node of the graph. +Node = namedtuple('Node', ['value', 'position']) - A grouping is a tuple representing the distribution of - characters in a word. By default, characters can be in - batches of 1 or 2. - Example: - >>> generate_groupings(4) - ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1)) +class Graph(): + """A directed acyclic graph that stores all possible + elemental spellings of a word. 
+ """ + def __init__(self): + self._parents_of = defaultdict(set) + self._children_of = defaultdict(set) + + def firsts(self): + """Return nodes with no parents.""" + return self._children_of[None] + + def lasts(self): + """Return nodes with no children.""" + return self._parents_of[None] + + def add_edge(self, parent, child): + """Add a parent-child relatonship to the graph. + None is ok as a key, but not a value. + """ + if parent is not None: + self._parents_of[child].add(parent) + if child is not None: + self._children_of[parent].add(child) + + def edges(self): + """Return a list of all parent-child relationships.""" + return [ + (parent, child) + for parent in self._children_of + for child in self._children_of[parent] + ] + + def export(self): + """Print a string to stdout that can be interpreted by + Graphviz. + """ + print('digraph G {') + for (parent, child) in self.edges(): + a = None if parent is None else parent.value + b = None if child is None else child.value + print('\t{} -> {}'.format(a, b)) + print('}') + + +def find_all_paths(graph, start, end, path=[]): + """Return a list of all paths through the graph from start + to end. + Based on https://www.python.org/doc/essays/graphs/ + """ + path = path + [start] + if start == end: + return [path] + if start not in graph.keys(): + return [] + paths = [] + for node in graph[start]: + if node not in path: + newpaths = find_all_paths(graph, node, end, path) + for newpath in newpaths: + paths.append(tuple(newpath)) + return paths + + +def build_graph(word, graph, symbols=ELEMENTS): + """Given a word and a graph, recursively find all single and + double-character tokens in the word and add them to the graph. 
""" - cartesian_products = ( - product(batch_sizes, repeat=r) - for r in range(1, word_length + 1) - ) - - # include only groupings that represent the correct number of chars - groupings = tuple( - grouping - for grouping in chain.from_iterable(cartesian_products) - if sum(grouping) == word_length - ) - - log.debug('Groupings: {}'.format(groupings)) - log.debug(generate_groupings.cache_info()) - - return groupings - -def map_word(word, grouping): - """Return a word mapped to a grouping. + def segments(word, position=0, previous_root=None): + if word == '': + graph.add_edge(previous_root, None) + return - Example: - >>> map_word('because', (1, 2, 1, 1, 2)) - ('b', 'ec', 'a', 'u', 'se') - """ - if len(word) != sum(grouping): - raise ValueError( - 'Word length ({}) != sum of elements in grouping ({})'.format( - len(word), sum(grouping)) - ) + single_root = Node(word[0], position) + if single_root.value.capitalize() in symbols: + graph.add_edge(previous_root, single_root) - chars = (c for c in word) + if word not in processed: + segments(word[1:], position + 1, previous_root=single_root) - mapped = [] - for batch_size in grouping: - batch = "" - for _ in range(batch_size): - batch += next(chars) - mapped.append(batch) + if len(word) >= 2: + double_root = Node(word[0:2], position) + if double_root.value.capitalize() in symbols: + graph.add_edge(previous_root, double_root) - log.debug('Grouping: {}. 
Mapped word: {}'.format(grouping, mapped)) + if word not in processed: + segments(word[2:], position + 2, previous_root=double_root) + processed.add(word) - return tuple(mapped) + processed = set() + segments(word) if __name__ == '__main__': diff --git a/stoichiograph.py b/stoichiograph.py @@ -61,11 +61,6 @@ def get_args(): '-V', '--version', action='store_true', help='print version info and exit' ) - # TODO(amin): Remove this - parser.add_argument( - '--use-graph', action='store_true', - help='use the graph-based speller' - ) return parser.parse_args() @@ -121,9 +116,9 @@ def main(): for word in words: if TUPLES: - spellings = speller.spell(word, use_graph=args.use_graph) + spellings = speller.spell(word) else: - spellings = [''.join(s) for s in speller.spell(word, use_graph=args.use_graph)] + spellings = [''.join(s) for s in speller.spell(word)] if spellings: spellable[word] = spellings diff --git a/tests.py b/tests.py @@ -1,7 +1,6 @@ -import pytest import speller -ELEMENTS = ( +ELEMENTS = { 'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', @@ -12,34 +11,13 @@ ELEMENTS = ( 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og' -) +} def test_verify_data(): assert speller.ELEMENTS == ELEMENTS -def test_groupings(): - assert speller.generate_groupings(4, batch_sizes=()) == () - - assert speller.generate_groupings(4, batch_sizes=(1, 2)) == ( - (2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1) - ) - - assert speller.generate_groupings(4, batch_sizes=(1, 2, 3)) == ( - (1, 3), (2, 2), (3, 1), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1) - ) - - -def test_map_word(): - assert speller.map_word('because', (1, 2, 1, 1, 2)) == ('b', 'ec', 'a', 'u', 'se') - assert 
speller.map_word('osiris', (1, 3, 2)) == ('o', 'sir', 'is') - - with pytest.raises(ValueError): - speller.map_word('toolong', (2, 1)) - speller.map_word('short', (2, 2, 2)) - - def test_elemental_spelling(): assert speller.spell('amputation') == [ ('Am', 'Pu', 'Ta', 'Ti', 'O', 'N'),