commit 0890827b23182eda96881b5e2401c9650fb9d2cd
parent a5d52a852a0ff16cf915ca449e995e16ce111da0
Author: amin <dev@aminmesbah.com>
Date: Wed, 22 Feb 2017 07:40:36 +0000
Replace old spelling method with graph method.
The new graph code is immensely more efficient!
FossilOrigin-Name: 855d613c68ca66fd8865d9f881eede326adef28f26528d257ed0e450a394e6c4
Diffstat:
D | dag.py | | | 126 | ------------------------------------------------------------------------------- |
M | speller.py | | | 172 | ++++++++++++++++++++++++++++++++++++++++++++++--------------------------------- |
M | stoichiograph.py | | | 9 | ++------- |
M | tests.py | | | 26 | ++------------------------ |
4 files changed, 105 insertions(+), 228 deletions(-)
diff --git a/dag.py b/dag.py
@@ -1,126 +0,0 @@
-from collections import defaultdict, namedtuple
-
-ELEMENTS = {
- 'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
- 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe',
- 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y',
- 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb',
- 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd',
- 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir',
- 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac',
- 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No',
- 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl',
- 'Mc', 'Lv', 'Ts', 'Og'
-}
-
-# A single node of the graph.
-Node = namedtuple('Node', ['value', 'position'])
-
-
-class Graph():
- """A directed acyclic graph that stores all possible
- elemental spellings of a word.
- """
- def __init__(self):
- self._parents_of = defaultdict(set)
- self._children_of = defaultdict(set)
-
- def firsts(self):
- """Return nodes with no parents."""
- return self._children_of[None]
-
- def lasts(self):
- """Return nodes with no children."""
- return self._parents_of[None]
-
- def add_edge(self, parent, child):
- """Add a parent-child to child relatonship to the graph.
- None is ok as a key, but not a value.
- """
- if parent is not None:
- self._parents_of[child].add(parent)
- if child is not None:
- self._children_of[parent].add(child)
-
- def edges(self):
- """Return a list of all parent-child relationships."""
- return [
- (parent, child)
- for parent in self._children_of
- for child in self._children_of[parent]
- ]
-
- def export(self):
- """Print a string to stdout that can be interpreted by
- Graphviz.
- """
- print('digraph G {')
- for (parent, child) in self.edges():
- print('\t{} -> {}'.format(parent.value, child.value))
- print('}')
-
-
-def find_all_paths(graph, start, end, path=[]):
- """Return a list of all paths through the graph from start
- to end.
- Based on https://www.python.org/doc/essays/graphs/
- """
- path = path + [start]
- if start == end:
- return [path]
- if start not in graph.keys():
- return []
- paths = []
- for node in graph[start]:
- if node not in path:
- newpaths = find_all_paths(graph, node, end, path)
- for newpath in newpaths:
- paths.append(tuple(newpath))
- return paths
-
-
-def build_graph(word, graph, symbols=ELEMENTS):
- """Given a word and a graph, recursively find all single and
- double-character tokens in the word and add them to the graph.
- """
-
- def segments(word, position=0, previous_root=None):
- if word == '':
- graph.add_edge(previous_root, None)
- return
-
- single_root = Node(word[0], position)
- if single_root.value.capitalize() in symbols:
- graph.add_edge(previous_root, single_root)
-
- if word not in processed:
- single_stem = word[1:]
- segments(single_stem, position + 1, previous_root=single_root)
-
- if len(word) >= 2:
- double_root = Node(word[0:2], position)
- if double_root.value.capitalize() in symbols:
- graph.add_edge(previous_root, double_root)
-
- if word not in processed:
- double_stem = word[2:]
- segments(double_stem, position + 2, previous_root=double_root)
- processed.add(word)
-
- processed = set()
- segments(word)
-
-
-if __name__ == '__main__':
- from pprint import pprint
- w = 'inconspicuous'
- g = Graph()
- build_graph(w, g)
-
- spellings = list()
- for first in g._children_of[None]:
- for last in g._parents_of[None]:
- for path in find_all_paths(g._children_of, first, last):
- spellings.append(tuple(node.value for node in path))
-
- pprint(spellings)
diff --git a/speller.py b/speller.py
@@ -1,17 +1,13 @@
+from collections import defaultdict, namedtuple
from functools import lru_cache
-from itertools import chain, product
import logging
-import dag
-# TODO(amin): Profile and optimize
-# TODO(amin): Add performance reporting to log
-# TODO(amin): Use recursion to save time with long words that can't be spelled.
# TODO(amin): Convert symbol tuple to element name or atomic number tuple
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())
-ELEMENTS = (
+ELEMENTS = {
'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe',
'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y',
@@ -22,13 +18,13 @@ ELEMENTS = (
'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No',
'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl',
'Mc', 'Lv', 'Ts', 'Og'
-)
+}
# TODO(amin): Use optional caching/memoization to improve performance
# TODO(amin): Support appostrophies
# TODO(amin): Add option to require no repeated symbols
-def spell(word, use_graph=False, symbols=ELEMENTS):
+def spell(word, symbols=ELEMENTS):
"""Return a list of any possible ways to spell a word
with a given set of symbols.
@@ -38,88 +34,122 @@ def spell(word, use_graph=False, symbols=ELEMENTS):
"""
log.info('Word: {}'.format(word))
- if use_graph:
- log.debug('Using graph speller')
- g = dag.Graph()
- dag.build_graph(word, g)
+ log.debug('Using graph speller')
+ g = Graph()
+ build_graph(word, g)
- spellings = list()
- for first in g.firsts():
- for last in g.lasts():
- for path in dag.find_all_paths(g._children_of, first, last):
- spellings.append(tuple(node.value for node in path))
+ spellings = list()
+ for first in g.firsts():
+ for last in g.lasts():
+ for path in find_all_paths(g._children_of, first, last):
+ spellings.append(tuple(node.value for node in path))
- else:
- groupings = generate_groupings(len(word))
- spellings = [map_word(word, grouping) for grouping in groupings]
-
- elemental_spellings = [
+ elemental_spellings = sorted([
tuple(token.capitalize() for token in spelling)
for spelling in spellings
- # set operation: set of chars in spelling is subset of set of symbols
- if set(s.lower() for s in spelling) <= set(s.lower() for s in symbols)
- ]
+ ], reverse=True)
log.info('Spellings: {}'.format(elemental_spellings))
return elemental_spellings
-@lru_cache(maxsize=None)
-def generate_groupings(word_length, batch_sizes=(1, 2)):
- """Return all groupings for a word of a given length.
+# A single node of the graph.
+Node = namedtuple('Node', ['value', 'position'])
- A grouping is a tuple representing the distribution of
- characters in a word. By default, characters can be in
- batches of 1 or 2.
- Example:
- >>> generate_groupings(4)
- ((2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1))
+class Graph():
+ """A directed acyclic graph that stores all possible
+ elemental spellings of a word.
+ """
+ def __init__(self):
+ self._parents_of = defaultdict(set)
+ self._children_of = defaultdict(set)
+
+ def firsts(self):
+ """Return nodes with no parents."""
+ return self._children_of[None]
+
+ def lasts(self):
+ """Return nodes with no children."""
+ return self._parents_of[None]
+
+ def add_edge(self, parent, child):
+ """Add a parent-child relatonship to the graph.
+ None is ok as a key, but not a value.
+ """
+ if parent is not None:
+ self._parents_of[child].add(parent)
+ if child is not None:
+ self._children_of[parent].add(child)
+
+ def edges(self):
+ """Return a list of all parent-child relationships."""
+ return [
+ (parent, child)
+ for parent in self._children_of
+ for child in self._children_of[parent]
+ ]
+
+ def export(self):
+ """Print a string to stdout that can be interpreted by
+ Graphviz.
+ """
+ print('digraph G {')
+ for (parent, child) in self.edges():
+ a = None if parent is None else parent.value
+ b = None if child is None else child.value
+ print('\t{} -> {}'.format(a, b))
+ print('}')
+
+
+def find_all_paths(graph, start, end, path=[]):
+ """Return a list of all paths through the graph from start
+ to end.
+ Based on https://www.python.org/doc/essays/graphs/
+ """
+ path = path + [start]
+ if start == end:
+ return [path]
+ if start not in graph.keys():
+ return []
+ paths = []
+ for node in graph[start]:
+ if node not in path:
+ newpaths = find_all_paths(graph, node, end, path)
+ for newpath in newpaths:
+ paths.append(tuple(newpath))
+ return paths
+
+
+def build_graph(word, graph, symbols=ELEMENTS):
+ """Given a word and a graph, recursively find all single and
+ double-character tokens in the word and add them to the graph.
"""
- cartesian_products = (
- product(batch_sizes, repeat=r)
- for r in range(1, word_length + 1)
- )
-
- # include only groupings that represent the correct number of chars
- groupings = tuple(
- grouping
- for grouping in chain.from_iterable(cartesian_products)
- if sum(grouping) == word_length
- )
-
- log.debug('Groupings: {}'.format(groupings))
- log.debug(generate_groupings.cache_info())
-
- return groupings
-
-def map_word(word, grouping):
- """Return a word mapped to a grouping.
+ def segments(word, position=0, previous_root=None):
+ if word == '':
+ graph.add_edge(previous_root, None)
+ return
- Example:
- >>> map_word('because', (1, 2, 1, 1, 2))
- ('b', 'ec', 'a', 'u', 'se')
- """
- if len(word) != sum(grouping):
- raise ValueError(
- 'Word length ({}) != sum of elements in grouping ({})'.format(
- len(word), sum(grouping))
- )
+ single_root = Node(word[0], position)
+ if single_root.value.capitalize() in symbols:
+ graph.add_edge(previous_root, single_root)
- chars = (c for c in word)
+ if word not in processed:
+ segments(word[1:], position + 1, previous_root=single_root)
- mapped = []
- for batch_size in grouping:
- batch = ""
- for _ in range(batch_size):
- batch += next(chars)
- mapped.append(batch)
+ if len(word) >= 2:
+ double_root = Node(word[0:2], position)
+ if double_root.value.capitalize() in symbols:
+ graph.add_edge(previous_root, double_root)
- log.debug('Grouping: {}. Mapped word: {}'.format(grouping, mapped))
+ if word not in processed:
+ segments(word[2:], position + 2, previous_root=double_root)
+ processed.add(word)
- return tuple(mapped)
+ processed = set()
+ segments(word)
if __name__ == '__main__':
diff --git a/stoichiograph.py b/stoichiograph.py
@@ -61,11 +61,6 @@ def get_args():
'-V', '--version', action='store_true',
help='print version info and exit'
)
- # TODO(amin): Remove this
- parser.add_argument(
- '--use-graph', action='store_true',
- help='use the graph-based speller'
- )
return parser.parse_args()
@@ -121,9 +116,9 @@ def main():
for word in words:
if TUPLES:
- spellings = speller.spell(word, use_graph=args.use_graph)
+ spellings = speller.spell(word)
else:
- spellings = [''.join(s) for s in speller.spell(word, use_graph=args.use_graph)]
+ spellings = [''.join(s) for s in speller.spell(word)]
if spellings:
spellable[word] = spellings
diff --git a/tests.py b/tests.py
@@ -1,7 +1,6 @@
-import pytest
import speller
-ELEMENTS = (
+ELEMENTS = {
'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe',
'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y',
@@ -12,34 +11,13 @@ ELEMENTS = (
'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No',
'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl',
'Mc', 'Lv', 'Ts', 'Og'
-)
+}
def test_verify_data():
assert speller.ELEMENTS == ELEMENTS
-def test_groupings():
- assert speller.generate_groupings(4, batch_sizes=()) == ()
-
- assert speller.generate_groupings(4, batch_sizes=(1, 2)) == (
- (2, 2), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1)
- )
-
- assert speller.generate_groupings(4, batch_sizes=(1, 2, 3)) == (
- (1, 3), (2, 2), (3, 1), (1, 1, 2), (1, 2, 1), (2, 1, 1), (1, 1, 1, 1)
- )
-
-
-def test_map_word():
- assert speller.map_word('because', (1, 2, 1, 1, 2)) == ('b', 'ec', 'a', 'u', 'se')
- assert speller.map_word('osiris', (1, 3, 2)) == ('o', 'sir', 'is')
-
- with pytest.raises(ValueError):
- speller.map_word('toolong', (2, 1))
- speller.map_word('short', (2, 2, 2))
-
-
def test_elemental_spelling():
assert speller.spell('amputation') == [
('Am', 'Pu', 'Ta', 'Ti', 'O', 'N'),