This is the updated version that generates all possible constants in advance.

By chance, I found an even faster correct algorithm without using any patterns

“”"

pilgrim_spm.py

(C) Christian Tismer-Sperling 2023-08-02

# The Roman Numerals Converter as Structural Patterm Matching

This is a proof of concept, to learn about SPM and to see how far we get.

## The idea:

The Pilgrim algorithm is not the fastest, but completely correct, because

it can never admit a Roman Numeral that is not allowed.

Instead of implementing another sub-optimal algorithm, this implementation

uses the identical idea as the regex based Pilgrim solution, BUT it models

the pattern matching using SPM.

## How it works:

In the implementation, you can see the original regex code as comment.

Below that is the equivalent implementation as SPM.

If you look at match_four, this is easiest to understand. It shows how

we map the regex to equivalent SPM. In match_group1, this function is

then turned into a parameterless function using functools.partial .

The match_digit function is a bit more complicated because there are more

alternatives. The variable names have been choosen to be a little reminding

of the Roman characters. As an additional complication, we cannot use a

variable multiple times in a “case” clause, so we need to fix that with

guard-if constructs.

We could of course also have used character constants, directly and do all

calculations inline. But that would have grown the code size even bigger.

The elegance of this approach is that we can use the same function over

and over with different constants, see match_seq.

## Things to try

This attempt was almost as fast as the Pilgrim solution.

If we really have too much time, we might write instead a generator for

all expressions, making them all constants. That might give some speedup

but also more code bloat.

2023-08-02: Tried this, after using constants everywhere a little faster

than Pilgrim’s algorithm, at least on an M1 Mac Studio.

## The fastest version

The fastest version is now implemented very differently:

- generate a fast result using
`from_roman_numeral`

which might be erroneous
- convert it back with roman.toRoman (very fast)
- if the created string is the input string, it is correct.

“”"

```
import re
import roman
from functools import partial
from textwrap import dedent
__all__ = ["from_roman_spm", "RomanError", "InvalidRomanNumeralError"]
class RomanError(Exception):
pass
class InvalidRomanNumeralError(RomanError):
pass
# This is copied from M. Pilgrim for reference.
romanNumeralPattern = re.compile("""
^ # beginning of string
M{0,4} # thousands - 0 to 4 M's
(CM|CD|D?C{0,3}) # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 C's),
# or 500-800 (D, followed by 0 to 3 C's)
(XC|XL|L?X{0,3}) # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 X's),
# or 50-80 (L, followed by 0 to 3 X's)
(IX|IV|V?I{0,3}) # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 I's),
# or 5-8 (V, followed by 0 to 3 I's)
$ # end of string
""", re.VERBOSE)
value_map = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
def match_four(*seq):
# M{0,4}
match seq:
case "M", "M", "M", "M", *rest:
return 4 * value_map["M"], rest
case "M", "M", "M", *rest:
return 3 * value_map["M"], rest
case "M", "M", *rest:
return 2 * value_map["M"], rest
case "M", *rest:
return value_map["M"], rest
return 0, seq
# Old version, slightly slower.
def match_digit(C, M, D, *seq):
# (CM|CD|D?C{0,3})
match seq:
# CM | CD
case xC, xM, *r if xC == C and xM == M:
return value_map[M] - value_map[C], r
case xC, xD, *r if xC == C and xD == D:
return value_map[D] - value_map[C], r
# D C{0..3}
case xD, xC, a, b, *r if xD == D and C == xC == a == b:
return value_map[D] + value_map[C] * 3, r
case xD, xC, a, *r if xD == D and xC == C == a:
return value_map[D] + value_map[C] * 2, r
case xD, xC, *r if xD == D and xC == C:
return value_map[D] + value_map[C], r
case xD, *r if xD == D:
return value_map[D], r
# C{0..3}
case xC, a, b, *r if xC == C == a == b:
return value_map[C] * 3, r
case xC, a, *r if xC == C == a:
return value_map[C] * 2, r
case xC, *r if xC == C:
return value_map[C], r
return 0, seq
# New version, faster after I generated everything..
def gen_match_digit(name, C, M, D):
s = dedent(f"""
def {name}(*seq):
# (CM|CD|D?C{{0,3}})
match seq:
# CM | CD
case "{C}", "{M}", *r:
return {value_map[M] - value_map[C]}, r
case "{C}", "{D}", *r:
return {value_map[D] - value_map[C]}, r
# D C{{0..3}}
case "{D}", "{C}", "{C}", "{C}", *r:
return {value_map[D] + value_map[C] * 3}, r
case "{D}", "{C}", "{C}", *r:
return {value_map[D] + value_map[C] * 2}, r
case "{D}", "{C}", *r:
return {value_map[D] + value_map[C]}, r
case "{D}", *r:
return value_map["{D}"], r
# C{{0..3}}
case "{C}", "{C}", "{C}", *r:
return {value_map[C] * 3}, r
case "{C}", "{C}", *r:
return {value_map[C] * 2}, r
case "{C}", *r:
return {value_map[C]}, r
return 0, seq
""")
exec(s, globals())
match_group1 = match_four
# match_group2 = partial(match_digit, "C", "M", "D")
# match_group3 = partial(match_digit, "X", "C", "L")
# match_group4 = partial(match_digit, "I", "X", "V")
# This version is slightlly faster, almost the same as Pilgrim's.
gen_match_digit("match_group2", "C", "M", "D")
gen_match_digit("match_group3", "X", "C", "L")
gen_match_digit("match_group4", "I", "X", "V")
def match_seq(*seq):
res1, seq = match_group1(*seq)
res2, seq = match_group2(*seq)
res3, seq = match_group3(*seq)
res4, seq = match_group4(*seq)
if not seq:
return res1 + res2 + res3 + res4
def from_roman_spm(s):
# special case
if s == 'N':
return 0
if not s:
raise InvalidRomanNumeralError('Input can not be blank')
res = match_seq(*s)
if res == None:
raise InvalidRomanNumeralError(f"Invalid Roman numeral: {s}")
return res
# For comparison, one of the many fast but incomplete solution from the internet.
# https://bas.codes/posts/python-roman-numerals#converting-roman-numerals-to-numbers-in-python
def from_roman_numeral(numeral):
value_map = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
value = 0
last_digit_value = 0
for roman_digit in numeral[::-1]: # 1
digit_value = value_map[roman_digit]
if digit_value >= last_digit_value: # 2
value += digit_value
last_digit_value = digit_value
else: # 3
value -= digit_value
return value
def from_roman_fastest(numeral):
if numeral == 'N':
return 0
num = from_roman_numeral(numeral)
cmp = roman.toRoman(num)
if numeral != cmp:
raise InvalidRomanNumeralError(f"Invalid Roman numeral: {numeral}")
return num
if __name__ == "__main__":
import argparse
import sys
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("--unittest", "-u", action="store_true")
group.add_argument("--benchmark", "-b", action="store_true")
options = parser.parse_args()
if options.unittest:
import unittest
class TestCompat(unittest.TestCase):
def test_all_correct(self):
for num in range(5000):
s = roman.toRoman(num)
my_result = from_roman_spm(s)
orig_result = roman.fromRoman(s)
self.assertEqual(my_result, orig_result)
def test_all_correct_fastest(self):
for num in range(5000):
s = roman.toRoman(num)
my_result = from_roman_fastest(s)
orig_result = roman.fromRoman(s)
self.assertEqual(my_result, orig_result)
def test_incomplete(self):
ok_arg = "MCMLXXV"
bad_arg = "MC" + ok_arg
# Correct cases are correctly handled
self.assertEqual(from_roman_spm(ok_arg), 1975)
self.assertEqual(from_roman_numeral(ok_arg), 1975)
self.assertEqual(from_roman_fastest(ok_arg), 1975)
# The bad argument is correctly not recognized
with self.assertRaises(RomanError):
from_roman_spm(bad_arg)
with self.assertRaises(RomanError):
from_roman_fastest(bad_arg)
# but the incomplete solution recognizes it
self.assertEqual(from_roman_numeral(bad_arg), 2875)
unittest.main(argv=sys.argv[:1], verbosity=2)
if options.benchmark:
from timeit import timeit
from roman import fromRoman, toRoman
fname = "from_roman_spm"
t = timeit(fname + "('MCMLXXV')", globals=globals())
print(f"A million times {fname!r:20}: {t:g} s")
fname = "fromRoman"
t = timeit(fname + "('MCMLXXV')", globals=globals())
print(f"A million times {fname!r:20}: {t:g} s (Mark Pilgrim)")
fname = "from_roman_fastest"
t = timeit(fname + "('MCMLXXV')", globals=globals())
print(f"A million times {fname!r:20}: {t:g} s (incomplete with Pilgrim)")
fname = "toRoman"
t = timeit(fname + "(1975)", globals=globals())
print(f"A million times {fname!r:20}: {t:g} s (Mark Pilgrim)")
fname = "from_roman_numeral"
t = timeit(fname + "('MCMLXXV')", globals=globals())
print(f"A million times {fname!r:20}: {t:g} s (incomplete)")
sys.exit(0)
parser.print_help()
```