commit 0890827b23182eda96881b5e2401c9650fb9d2cd
parent a5d52a852a0ff16cf915ca449e995e16ce111da0
Author: amin <dev@aminmesbah.com>
Date:   Wed, 22 Feb 2017 07:40:36 +0000
Replace old spelling method with graph method.
The new graph code is immensely more efficient!
FossilOrigin-Name: 855d613c68ca66fd8865d9f881eede326adef28f26528d257ed0e450a394e6c4
Diffstat:
| D | dag.py | | | 126 | ------------------------------------------------------------------------------- | 
| M | speller.py | | | 172 | ++++++++++++++++++++++++++++++++++++++++++++++--------------------------------- | 
| M | stoichiograph.py | | | 9 | ++------- | 
| M | tests.py | | | 26 | ++------------------------ | 
4 files changed, 105 insertions(+), 228 deletions(-)
diff --git a/dag.py b/dag.py
@@ -1,126 +0,0 @@
-from collections import defaultdict, namedtuple
-
-ELEMENTS = {
-    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
-    'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe',
-    'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y',
-    'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb',
-    'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd',
-    'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir',
-    'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac',
-    'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No',
-    'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl',
-    'Mc', 'Lv', 'Ts', 'Og'
-}
-
-# A single node of the graph.
-Node = namedtuple('Node', ['value', 'position'])
-
-
-class Graph():
-    """A directed acyclic graph that stores all possible
-    elemental spellings of a word.
-    """
-    def __init__(self):
-        self._parents_of = defaultdict(set)
-        self._children_of = defaultdict(set)
-
-    def firsts(self):
-        """Return nodes with no parents."""
-        return self._children_of[None]
-
-    def lasts(self):
-        """Return nodes with no children."""
-        return self._parents_of[None]
-
-    def add_edge(self, parent, child):
-        """Add a parent-child to child relatonship to the graph.
-        None is ok as a key, but not a value.
-        """
-        if parent is not None:
-            self._parents_of[child].add(parent)
-        if child is not None:
-            self._children_of[parent].add(child)
-
-    def edges(self):
-        """Return a list of all parent-child relationships."""
-        return [
-            (parent, child)
-            for parent in self._children_of
-            for child in self._children_of[parent]
-        ]
-
-    def export(self):
-        """Print a string to stdout that can be interpreted by
-        Graphviz.
-        """
-        print('digraph G {')
-        for (parent, child) in self.edges():
-            print('\t{} -> {}'.format(parent.value, child.value))
-        print('}')
-
-
-def find_all_paths(graph, start, end, path=[]):
-    """Return a list of all paths through the graph from start
-    to end.
-    Based on https://www.python.org/doc/essays/graphs/
-    """
-    path = path + [start]
-    if start == end:
-        return [path]
-    if start not in graph.keys():
-        return []
-    paths = []
-    for node in graph[start]:
-        if node not in path:
-            newpaths = find_all_paths(graph, node, end, path)
-            for newpath in newpaths:
-                paths.append(tuple(newpath))
-    return paths
-
-
-def build_graph(word, graph, symbols=ELEMENTS):
-    """Given a word and a graph, recursively find all single and
-    double-character tokens in the word and add them to the graph.
-    """
-
-    def segments(word, position=0, previous_root=None):
-        if word == '':
-            graph.add_edge(previous_root, None)
-            return
-
-        single_root = Node(word[0], position)
-        if single_root.value.capitalize() in symbols:
-            graph.add_edge(previous_root, single_root)
-
-            if word not in processed:
-                single_stem = word[1:]
-                segments(single_stem, position + 1, previous_root=single_root)
-
-        if len(word) >= 2:
-            double_root = Node(word[0:2], position)
-            if double_root.value.capitalize() in symbols:
-                graph.add_edge(previous_root, double_root)
-
-                if word not in processed:
-                    double_stem = word[2:]
-                    segments(double_stem, position + 2, previous_root=double_root)
-        processed.add(word)
-
-    processed = set()
-    segments(word)
-
-
-if __name__ == '__main__':
-    from pprint import pprint
-    w = 'inconspicuous'
-    g = Graph()
-    build_graph(w, g)
-
-    spellings = list()
-    for first in g._children_of[None]:
-        for last in g._parents_of[None]:
-            for path in find_all_paths(g._children_of, first, last):
-                spellings.append(tuple(node.value for node in path))
-
-    pprint(spellings)
diff --git a/speller.py b/speller.py
@@ -1,17 +1,13 @@
+from collections import defaultdict, namedtuple
 from functools import lru_cache
-from itertools import chain, product
 import logging
-import dag
 
-# TODO(amin): Profile and optimize
-# TODO(amin): Add performance reporting to log
-# TODO(amin): Use recursion to save time with long words that can't be spelled.
 # TODO(amin): Convert symbol tuple to element name or atomic number tuple
 
 log = logging.getLogger(__name__)
 log.addHandler(logging.NullHandler())
 
-ELEMENTS = (
+ELEMENTS = {
     'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
     'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe',
     'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y',
@@ -22,13 +18,13 @@ ELEMENTS = (
     'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No',
     'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl',
     'Mc', 'Lv', 'Ts', 'Og'
-)
+}
 
 
 # TODO(amin): Use optional caching/memoization to improve performance
 # TODO(amin): Support appostrophies
 # TODO(amin): Add option to require no repeated symbols
-def spell(word, use_graph=False, symbols=ELEMENTS):
+def spell(word, symbols=ELEMENTS):
     """Return a list of any possible ways to spell a word
     with a given set of symbols.
 
@@ -38,88 +34,122 @@ def spell(word, use_graph=False, symbols=ELEMENTS):
     """
     log.info('Word: {}'.format(word))
 
-    if use_graph:
-        log.debug('Using graph speller')
-        g = dag.Graph()
-        dag.build_graph(word, g)
+    log.debug('Using graph speller')
+    g = Graph()
+    build_graph(word, g)
 
-        spellings = list()
-        for first in g.firsts():
-            for last in g.lasts():
-                for path in dag.find_all_paths(g._children_of, first, last):
-                    spellings.append(tuple(node.value for node in path))
+    spellings = list()
+    for first in g.firsts():
+        for last in g.lasts():
+            for path in find_all_paths(g._children_of, first, last):
+                spellings.append(tuple(node.value for node in path))
 
-    else:
-        groupings = generate_groupings(len(word))
-        spellings = [map_word(word, grouping) for grouping in groupings]
-
-    elemental_spellings = [
+    elemental_spellings = sorted([
         tuple(token.capitalize() for token in spelling)
         for spelling in spellings
-        # set operation: set of chars in spelling is subset of set of symbols
-        if set(s.lower() for s in spelling) <= set(s.lower() for s in symbols)
-    ]
+    ], reverse=True)
 
     log.info('Spellings: {}'.format(elemental_spellings))
 
     return elemental_spellings
 
 
-@lru_cache(maxsize=None)
-def generate_groupings(word_length, batch_sizes=(1, 2)):
-    """Return all groupings for a word of a given length.
+# A single node of the graph.
+Node = namedtuple('Node', ['value', 'position'])
 
-    A grouping is a tuple representing the distribution of
-    characters in a word. By default, characters can be in
-    batches of 1 or 2.
 
-    Example:
-    >>> generate_groupings(4)
-    ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
+class Graph():
+    """A directed acyclic graph that stores all possible
+    elemental spellings of a word.
+    """
+    def __init__(self):
+        self._parents_of = defaultdict(set)
+        self._children_of = defaultdict(set)
+
+    def firsts(self):
+        """Return nodes with no parents."""
+        return self._children_of[None]
+
+    def lasts(self):
+        """Return nodes with no children."""
+        return self._parents_of[None]
+
+    def add_edge(self, parent, child):
+        """Add a parent-child relatonship to the graph.
+        None is ok as a key, but not a value.
+        """
+        if parent is not None:
+            self._parents_of[child].add(parent)
+        if child is not None:
+            self._children_of[parent].add(child)
+
+    def edges(self):
+        """Return a list of all parent-child relationships."""
+        return [
+            (parent, child)
+            for parent in self._children_of
+            for child in self._children_of[parent]
+        ]
+
+    def export(self):
+        """Print a string to stdout that can be interpreted by
+        Graphviz.
+        """
+        print('digraph G {')
+        for (parent, child) in self.edges():
+            a = None if parent is None else parent.value
+            b = None if child is None else child.value
+            print('\t{} -> {}'.format(a, b))
+        print('}')
+
+
+def find_all_paths(graph, start, end, path=[]):
+    """Return a list of all paths through the graph from start
+    to end.
+    Based on https://www.python.org/doc/essays/graphs/
+    """
+    path = path + [start]
+    if start == end:
+        return [path]
+    if start not in graph.keys():
+        return []
+    paths = []
+    for node in graph[start]:
+        if node not in path:
+            newpaths = find_all_paths(graph, node, end, path)
+            for newpath in newpaths:
+                paths.append(tuple(newpath))
+    return paths
+
+
+def build_graph(word, graph, symbols=ELEMENTS):
+    """Given a word and a graph, recursively find all single and
+    double-character tokens in the word and add them to the graph.
     """
-    cartesian_products = (
-        product(batch_sizes, repeat=r)
-        for r in range(1, word_length + 1)
-    )
-
-    # include only groupings that represent the correct number of chars
-    groupings = tuple(
-        grouping
-        for grouping in chain.from_iterable(cartesian_products)
-        if sum(grouping) == word_length
-    )
-
-    log.debug('Groupings: {}'.format(groupings))
-    log.debug(generate_groupings.cache_info())
-
-    return groupings
-
 
-def map_word(word, grouping):
-    """Return a word mapped to a grouping.
+    def segments(word, position=0, previous_root=None):
+        if word == '':
+            graph.add_edge(previous_root, None)
+            return
 
-    Example:
-    >>> map_word('because', (1, 2, 1, 1, 2))
-    ('b', 'ec', 'a', 'u', 'se')
-    """
-    if len(word) != sum(grouping):
-        raise ValueError(
-            'Word length ({}) != sum of elements in grouping ({})'.format(
-                len(word), sum(grouping))
-        )
+        single_root = Node(word[0], position)
+        if single_root.value.capitalize() in symbols:
+            graph.add_edge(previous_root, single_root)
 
-    chars = (c for c in word)
+            if word not in processed:
+                segments(word[1:], position + 1, previous_root=single_root)
 
-    mapped = []
-    for batch_size in grouping:
-        batch = ""
-        for _ in range(batch_size):
-            batch += next(chars)
-        mapped.append(batch)
+        if len(word) >= 2:
+            double_root = Node(word[0:2], position)
+            if double_root.value.capitalize() in symbols:
+                graph.add_edge(previous_root, double_root)
 
-    log.debug('Grouping: {}. Mapped word: {}'.format(grouping, mapped))
+                if word not in processed:
+                    segments(word[2:], position + 2, previous_root=double_root)
+        processed.add(word)
 
-    return tuple(mapped)
+    processed = set()
+    segments(word)
 
 
 if __name__ == '__main__':
diff --git a/stoichiograph.py b/stoichiograph.py
@@ -61,11 +61,6 @@ def get_args():
         '-V', '--version', action='store_true',
         help='print version info and exit'
     )
-    # TODO(amin): Remove this
-    parser.add_argument(
-        '--use-graph', action='store_true',
-        help='use the graph-based speller'
-    )
 
     return parser.parse_args()
 
@@ -121,9 +116,9 @@ def main():
 
     for word in words:
         if TUPLES:
-            spellings = speller.spell(word, use_graph=args.use_graph)
+            spellings = speller.spell(word)
         else:
-            spellings = [''.join(s) for s in speller.spell(word, use_graph=args.use_graph)]
+            spellings = [''.join(s) for s in speller.spell(word)]
 
         if spellings:
             spellable[word] = spellings
diff --git a/tests.py b/tests.py
@@ -1,7 +1,6 @@
-import pytest
 import speller
 
-ELEMENTS = (
+ELEMENTS = {
     'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
     'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe',
     'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y',
@@ -12,34 +11,13 @@ ELEMENTS = (
     'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No',
     'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl',
     'Mc', 'Lv', 'Ts', 'Og'
-)
+}
 
 
 def test_verify_data():
     assert speller.ELEMENTS == ELEMENTS
 
 
-def test_groupings():
-    assert speller.generate_groupings(4, batch_sizes=()) == ()
-
-    assert speller.generate_groupings(4, batch_sizes=(1, 2)) == (
-        (2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1)
-    )
-
-    assert speller.generate_groupings(4, batch_sizes=(1, 2, 3)) == (
-        (1, 3), (2, 2), (3, 1), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1)
-    )
-
-
-def test_map_word():
-    assert speller.map_word('because', (1, 2, 1, 1, 2)) == ('b', 'ec', 'a', 'u', 'se')
-    assert speller.map_word('osiris', (1, 3, 2)) == ('o', 'sir', 'is')
-
-    with pytest.raises(ValueError):
-        speller.map_word('toolong', (2, 1))
-        speller.map_word('short', (2, 2, 2))
-
-
 def test_elemental_spelling():
     assert speller.spell('amputation') == [
         ('Am', 'Pu', 'Ta', 'Ti', 'O', 'N'),