How can I modify my Python code to extract and format list items from a mixed input string as described, including contextualizing each list item?

hkbn8 · September 9, 2023, 2:09pm

Code:

import textwrap


def filter_lists(text):
    """Return only the lines of ``text`` that look like list items.

    A line is kept when, after stripping surrounding whitespace, it starts
    with one of the recognised list markers below. Kept lines are returned
    in their original order and with their original whitespace.

    Args:
        text: Mixed prose/list input; lines are separated by newlines.

    Returns:
        The matching lines joined by newlines, or an empty string when no
        line matches.
    """
    patterns = [
        r'^[0-9]+\.',  # number list item
        r'^[a-zA-Z]\.',  # letter list item
        r'^\u2022',  # bullet point list item
        # Roman numerals in either case: headings such as "III." were
        # previously dropped because only lowercase i/v/x was accepted.
        r'^[ivxIVX]+\.',  # roman numeral list item
        r'^\u25E6',  # special bullet point list item
        r'^\u2713',  # Checkmark List Item
        r'^[→←↑↓]',  # Arrow List Item (Add arrows as needed)
        r'^♦',  # Diamond List Item
        r'^★',  # Star List Item
        r'^[^\w\s]',  # Emoji List Item (Matches any non-word, non-space character)
        r'^\uE000',  # Icon List Item (Replace with the specific Unicode code for your icon)
        r'^[@#*%!&]',  # Custom Symbol List Item (Add your custom symbols within the brackets)
        r'^(red|blue|green|yellow)',  # Color-Coded List Item (Add color names or codes)
        r'^\d+\.(jpg|png|gif)',  # Image List Item (Matches numbered image file names)
        r'^\[\d{1,3}%\]',  # Progress Bar List Item (Matches percentages in square brackets)
        r'^\[[A-Za-z]+\]',  # Tag or Label List Item (Matches words in square brackets)
        r'^\d+⚫',  # Numbered Icon List Item (Matches numbers followed by a black circle)
        r'^"([^"]+)"',  # Quote List Item (Matches text enclosed in double quotes)
        r'^\d{8}',  # Barcode List Item (Matches 8-digit numbers, adjust as needed)
    ]
    # Compile once per call instead of re-resolving every pattern per line.
    compiled_patterns = [re.compile(pattern) for pattern in patterns]

    filtered_lines = []
    for line in text.split('\n'):
        # Match against the stripped text so indented items still qualify,
        # but keep the untouched line in the result.
        stripped = line.strip()
        if any(regex.match(stripped) for regex in compiled_patterns):
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)
import hashlib
import re

def process_string(input_string, prefix_format=None, hierarchy=None):
    """Rebuild the lines of ``input_string`` with normalized list prefixes.

    Each non-blank line is split into prefix, separator and content, passed
    through ``normalize_prefix`` and rendered by ``format_output``; the
    rendered lines are concatenated into the returned string.

    Args:
        input_string: Text to process; it is split on newlines.
        prefix_format: Per-level prefix format dict, or a list of them. When
            ``None`` it is inferred from the first line with
            ``infer_prefix_format``; a non-list value is wrapped in a list.
        hierarchy: Per-level preference list, or a list of them. When
            ``None`` it is inferred with ``infer_hierarchy``; a non-list
            value is wrapped in a list.

    Returns:
        The formatted text, or ``''`` when the (dedented, wrapped) input is
        empty or only whitespace.

    Raises:
        ValueError: If a line is nested deeper than ``prefix_format`` or
            ``hierarchy`` has entries for, or if ``validate_prefix_format``
            / ``validate_hierarchy`` reject an entry (those calls are
            outside the ``try`` block, so their errors propagate).
    """
    output_string = ''
    used_prefixes = {}
    stack = []  # prefixes appended whenever normalize_prefix reports a new level
    level = 0
    if prefix_format is None:
        first_line = input_string.split('\n')[0]
        prefix_format = infer_prefix_format(first_line)
    if not isinstance(prefix_format, list):
        prefix_format = [prefix_format]
    if hierarchy is None:
        hierarchy = infer_hierarchy(input_string)
    if not isinstance(hierarchy, list):
        hierarchy = [hierarchy]
    input_string = textwrap.dedent(input_string)
    # NOTE(review): textwrap.wrap() with default options replaces whitespace
    # (newlines included) with spaces before re-wrapping at 80 columns, so the
    # original one-item-per-line layout is presumably lost here — confirm
    # whether wrapping is intended at all.
    input_string = '\n'.join(textwrap.wrap(input_string, width=80))
    lines = input_string.split('\n')
    if not input_string or input_string.isspace():
        return output_string
    for line in lines:
        line = line.strip()
        if not line:
            # Blank line: emit it and trim the bookkeeping to the current level.
            output_string += '\n'
            used_prefixes = {k: v for k, v in used_prefixes.items() if k in stack[:level]}
            stack = stack[:level]
            continue
        # NOTE(review): `line` was stripped above, so the leading-whitespace
        # regex cannot match a non-empty line and current_level is always 0.
        current_level = len(re.match(r'^(\s+)', line).group(1)) // 4 if re.match(r'^(\s+)', line) else 0
        if current_level > level:
            # Going one level deeper: pick and validate that level's settings.
            level += 1
            if len(prefix_format) >= level:
                format_dict = prefix_format[level - 1]
                validate_prefix_format(format_dict)
            else:
                raise ValueError(f"Missing prefix format for level {level}")
            if len(hierarchy) >= level:
                preference_list = hierarchy[level - 1]
                validate_hierarchy(preference_list)
            else:
                raise ValueError(f"Missing hierarchy for level {level}")
        elif current_level < level:
            # Going back up: unwind one stack entry per level.
            # NOTE(review): format_dict / preference_list are not refreshed on
            # this path and keep whatever an earlier iteration assigned.
            while current_level < level and stack:
                stack.pop()
                used_prefixes.popitem()
                level -= 1
        else:
            # Same level. With level == 0 the index is -1, i.e. the LAST entry
            # of each list is used.
            format_dict = prefix_format[level - 1]
            preference_list = hierarchy[level - 1]
        try:
            # Split "<prefix><whitespace><content>"; content is optional.
            match = re.match(r'(\d+\.|\w+\.|\w+\-|\w+\:|\S+)(\s+)(.+)?', line)
            if match:
                prefix = match.group(1)
                separator = match.group(2)
                content = match.group(3) or 'N/A'
            else:
                prefix = ''
                separator = ''
                content = ''
            prefix, separator, content, different_level = normalize_prefix(prefix, stack, used_prefixes=used_prefixes, separator=separator, mode='lenient', hierarchy=preference_list)
        except ValueError as e:
            # Only ValueError is handled: the message is printed and the line
            # is skipped. Other exception types propagate to the caller.
            print(e)
            continue
        else:
            if not content or content.isspace():
                content = 'N/A'
            output_string += format_output(prefix, separator, content, level, format_dict, preference_list)
            if different_level:
                stack.append(prefix)
    return output_string
def infer_hierarchy(input_string):
    """Infer which prefix types occur at each indentation slot of the input.

    Every line that starts with a prefix-like token contributes one of
    ``'number'``, ``'letter'`` or ``'other'``. The token is classified after
    removing its trailing separator (``.``, ``-`` or ``:``), so ``"1."``
    counts as a number and ``"a."`` as a letter; previously the separator
    made ``isdigit()``/``isalpha()`` fail and every such prefix fell into
    ``'other'``.

    Args:
        input_string: Text to scan; it is split on newlines.

    Returns:
        A list of preference lists. The number of leading whitespace
        characters of a line is used directly as the index: when that index
        already exists the type is added to that list (if missing),
        otherwise a new one-element list is appended at the end.
    """
    inferred_hierarchy = []
    for line in input_string.split('\n'):
        match = re.match(r'^(\s*)(\d+\.|\w+\.|\w+\-|\w+\:|\S+)', line)
        if not match:
            continue
        indentation = len(match.group(1))
        # Classify the marker itself, not the marker plus its separator.
        marker = match.group(2).rstrip('.-:')
        if marker.isdigit():
            prefix_type = 'number'
        elif marker.isalpha():
            prefix_type = 'letter'
        else:
            prefix_type = 'other'
        if indentation < len(inferred_hierarchy):
            preference_list = inferred_hierarchy[indentation]
            if prefix_type not in preference_list:
                preference_list.append(prefix_type)
        else:
            inferred_hierarchy.append([prefix_type])
    return inferred_hierarchy
def validate_hierarchy(preference_list):
    """Check that ``preference_list`` is a well-formed preference list.

    A well-formed list is a non-empty ``list`` whose elements are all
    strings drawn from 'number', 'letter' and 'other'.

    Args:
        preference_list: The candidate preference list.

    Returns:
        None. The function only signals problems by raising.

    Raises:
        ValueError: If the argument is not a list, is empty, holds a
            non-string element, or holds a string outside the allowed set.
    """
    allowed_types = ('number', 'letter', 'other')
    if not isinstance(preference_list, list):
        raise ValueError("Preference list must be a list")
    if len(preference_list) == 0:
        raise ValueError("Preference list cannot be empty")
    for entry in preference_list:
        if not isinstance(entry, str):
            raise ValueError("Preference list must contain only strings")
        if entry not in allowed_types:
            raise ValueError("Preference list must contain only 'number', 'letter', or 'other'")
def format_output(prefix, separator, content, level, format_dict, hierarchy):
    """Render one output line: indentation, decorated prefix, separator, content.

    The decoration for the prefix is looked up either positionally in
    ``hierarchy[level]`` (index 0 for all-digit prefixes, 1 for all-letter
    prefixes, 2 for anything else) when ``hierarchy`` is non-empty and has
    an entry for ``level``, or otherwise by key ('number', 'letter',
    'other') in ``format_dict``. Digit and letter decorations are appended
    to the prefix; the third kind is a ``str.format`` template that
    receives the prefix.

    Args:
        prefix: The list marker to decorate.
        separator: Text placed after the decorated prefix and a space.
        content: The item text.
        level: Nesting level; the line is indented by four spaces per level.
        format_dict: Fallback decorations keyed by prefix type.
        hierarchy: Optional per-level decoration sequences.

    Returns:
        The formatted line, terminated by a newline.
    """
    indent = ' ' * (level * 4)

    # Decide where the decoration comes from before classifying the prefix.
    from_hierarchy = bool(hierarchy) and level < len(hierarchy)
    preference = hierarchy[level] if from_hierarchy else None

    # 0 = digits only, 1 = letters only, 2 = everything else.
    if prefix.isdigit():
        kind = 0
    elif prefix.isalpha():
        kind = 1
    else:
        kind = 2

    if from_hierarchy:
        decoration = preference[kind]
    else:
        decoration = format_dict[('number', 'letter', 'other')[kind]]

    if kind == 2:
        label = decoration.format(prefix)
    else:
        label = prefix + decoration

    return indent + label + ' ' + separator + content + '\n'
def validate_prefix_format(format_dict):
    """Check that ``format_dict`` describes a usable prefix format.

    The dictionary must contain the keys 'number', 'letter' and 'other'.
    Every value (or, for a non-string value, its first element) must be
    '.', '-' or start with the placeholder '({})'.

    Args:
        format_dict: Mapping from prefix type to separator/format. It is
            modified in place when it contains the key 'l': that entry's
            value is copied onto 'letter'.

    Returns:
        None. The function only signals problems by raising.

    Raises:
        ValueError: If a required key is missing or a value is not an
            accepted separator/format.
    """
    required_keys = ('number', 'letter', 'other')
    if any(required not in format_dict for required in required_keys):
        raise ValueError("Prefix format dictionary must have keys for numbers, letters, and other types of prefixes")

    for name, spec in format_dict.items():
        # Anything that is not a plain string is treated as a sequence and
        # represented by its first element.
        if not isinstance(spec, str):
            spec = spec[0]
        is_plain_separator = spec in ('.', '-')
        if not is_plain_separator and not re.match(r'\(\{\}\)', spec):
            raise ValueError(f"Prefix format dictionary must have a valid separator or format for {name} type of prefix")
        # An 'l' entry overrides the (already present) 'letter' entry.
        if name == 'l':
            format_dict['letter'] = spec
def normalize_prefix(prefix, previous_prefixes, used_prefixes={}, separator='.', mode='lenient', hierarchy=None):
    """Normalize a list prefix against the prefixes seen so far.

    Args:
        prefix: The raw prefix token of the current line.
        previous_prefixes: Prefixes already accepted; its length is also
            used as the index into ``hierarchy``.
        used_prefixes: Counter dict that is updated in place in the
            mixed-number/letter branch. NOTE(review): the default is a
            mutable dict and is therefore shared between calls that omit
            this argument.
        separator: NOTE(review): the incoming value appears to be unused;
            the local is reassigned by the regex split below before any
            read, and the returned separators come from ``format_dict``.
        mode: ``'strict'`` raises on inconsistent or invalid prefixes; any
            other value takes the repair branches.
        hierarchy: Sequence indexed by ``len(previous_prefixes)`` whose
            entries are indexed at ``[0]`` for the preferred type.
            NOTE(review): ``None`` is the default, but the empty-prefix
            branch calls ``len(hierarchy)`` without a guard.

    Returns:
        On the paths that return explicitly, a 4-tuple
        ``(prefix, separator_or_format, content, flag)``; ``flag`` is
        ``True`` only when a default prefix was generated for an empty
        prefix. NOTE(review): several paths (the ``pass`` branches, a type
        mismatch with the last prefix, a falsy ``hierarchy`` with no
        previous prefixes) reach the end of the function without a
        ``return`` and therefore yield ``None``.

    Raises:
        ValueError: In ``'strict'`` mode, for invalid or inconsistent
            prefixes.
    """
    format_dict = {'number': '.', 'letter': '-', 'other': '({})'}
    # MD5 is only used to derive a fallback label for the prefix.
    hashed_prefix = hashlib.md5(prefix.encode()).hexdigest()
    # split prefix into number and letter parts
    # (\w also matches digits, so a purely numeric prefix of two or more
    # digits matches this pattern as well)
    match = re.match(r'^(\d+)(\w+)', prefix)
    if match:
        number_part = match.group(1)
        letter_part = match.group(2)
        # NOTE(review): both recursive calls can reach the end of the
        # function without a return value, in which case the tuple
        # unpacking below would fail — verify.
        # normalize number part
        number_part, number_separator, _, _ = normalize_prefix(number_part, previous_prefixes, used_prefixes=used_prefixes, separator=format_dict['number'], mode=mode, hierarchy=hierarchy)
        # normalize letter part
        letter_part, letter_separator, _, _ = normalize_prefix(letter_part, previous_prefixes, used_prefixes=used_prefixes, separator=format_dict['letter'], mode=mode, hierarchy=hierarchy)
        # combine number and letter parts
        content = prefix.replace(number_part, '').replace(letter_part, '')
        return (number_part + letter_part, format_dict['number'] if number_part.isdigit() else format_dict['letter'], content, False)
    # use a list of symbols
    symbols = ['•', '◦', '▪', '▫', '▸', '◂', '▴', '▾']
    # check if prefix is a symbol
    if prefix in symbols:
        # normalize symbol according to its position in the list
        index = symbols.index(prefix)
        sorted_symbols = sorted([p for p in previous_prefixes if p in symbols], key=symbols.index)
        new_index = sorted_symbols.index(prefix) if prefix in sorted_symbols else len(sorted_symbols)
        new_prefix = symbols[new_index]
        # prefix is a single symbol here, so this slice is empty
        content = prefix[1:]
        return (new_prefix, format_dict['other'].format(prefix), content, False)
    # Reduce the prefix to its first character; the remainder becomes
    # separator and/or content.
    match = re.match(r'^(\w|\S)(\W+)(.+)', prefix)
    if match:
        prefix = match.group(1)
        separator = match.group(2)
        content = match.group(3)
    else:
        match = re.match(r'^(\w|\S)(.+)', prefix)
        if match:
            prefix = match.group(1)
            separator = ''
            content = match.group(2)
        else:
            match = re.match(r'^(\w|\S)', prefix)
            if match:
                prefix = match.group(1)
                separator = ''
                content = ''
            else:
                prefix = ''
                separator = ''
                content = ''
    # handle empty or whitespace prefixes by assigning a default prefix based on the hierarchy
    if not prefix or prefix.isspace():
        level = len(previous_prefixes)
        if level < len(hierarchy):
            preference_list = hierarchy[level]
            first_type = preference_list[0]
            if first_type == 'number':
                sorted_number_prefixes = sorted([p for p in previous_prefixes if p.isdigit()], key=int)
                new_prefix = str(int(sorted_number_prefixes[-1]) + 1) if sorted_number_prefixes else '1'
            elif first_type == 'letter':
                sorted_letter_prefixes = sorted([p for p in previous_prefixes if p.isalpha()])
                new_prefix = chr(ord(sorted_letter_prefixes[-1]) + 1) if sorted_letter_prefixes else 'A'
            else:
                symbols = ['•', '◦', '▪', '▫', '▸', '◂', '▴', '▾']
                sorted_symbol_prefixes = sorted([p for p in previous_prefixes if p in symbols], key=symbols.index)
                new_prefix = symbols[symbols.index(sorted_symbol_prefixes[-1]) + 1] if sorted_symbol_prefixes else symbols[0]
            # format_dict only has the keys 'number', 'letter' and 'other';
            # any other first_type raises KeyError here.
            return (new_prefix, format_dict[first_type], content, True)
        else:
            return ('*', format_dict['other'].format('*'), content, True)
    # check if prefix contains both numbers and letters
    # NOTE(review): prefix was cut down to one character above, so this
    # two-or-more-character pattern presumably never matches — verify.
    if re.match(r'\d+\w+|\w+\d+', prefix):
        if mode == 'strict':
            raise ValueError(f"Invalid prefix: {prefix}")
        else:
            unique_prefix = prefix + str(used_prefixes.get(prefix, 0))
            used_prefixes[prefix] = used_prefixes.get(prefix, 0) + 1
            return (unique_prefix, format_dict['other'].format(prefix), content, False)
    # check if prefix is a valid alphanumeric character or a symbol
    if not re.match(r'\w|\S', prefix):
        if mode == 'strict':
            raise ValueError(f"Invalid prefix: {prefix}")
        else:
            return (hashed_prefix, format_dict['other'].format(prefix), content, False)
    # check if there are previous prefixes at the same level
    if previous_prefixes:
        last_prefix = previous_prefixes[-1]
        # check if prefix has the same type as the last prefix
        # (when the types differ, nothing is returned from this branch)
        if (prefix.isdigit() == last_prefix.isdigit()) and (prefix.isalpha() == last_prefix.isalpha()):
            # check if prefix is a number
            if prefix.isdigit():
                # check if prefix has one digit
                if len(prefix) == 1:
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # increment the prefix by one
                        new_prefix = str(int(prefix) + 1)
                        return (new_prefix, format_dict['number'], content, False)
                # check if prefix is less than or equal to the last prefix
                elif int(prefix) <= int(last_prefix):
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # sort the prefixes and find the index of the current prefix
                        sorted_prefixes = sorted(previous_prefixes + [prefix], key=int)
                        index = sorted_prefixes.index(prefix)
                        # assign a new prefix based on the index
                        new_prefix = str(index + 1)
                        return (new_prefix, format_dict['number'], content, False)
                else:
                    pass
            # check if prefix is a letter
            elif prefix.isalpha():
                # check if prefix has one letter
                if len(prefix) == 1:
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # increment the prefix by one
                        new_prefix = chr(ord(prefix) + 1)
                        return (new_prefix, format_dict['letter'], content, False)
                # check if prefix is less than or equal to the last prefix
                elif ord(prefix) <= ord(last_prefix):
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # sort the prefixes and find the index of the current prefix
                        sorted_prefixes = sorted(previous_prefixes + [prefix])
                        index = sorted_prefixes.index(prefix)
                        # assign a new prefix based on the index
                        new_prefix = chr(ord('A') + index)
                        return (new_prefix, format_dict['letter'], content, False)
                else:
                    pass
            else:
                pass
    else:
        # check if there is a hierarchy for the current level
        if hierarchy:
            level = len(previous_prefixes)
            # check if the hierarchy list has an element for the current level
            if level < len(hierarchy):
                preference_list = hierarchy[level]
                # check if the prefix type matches the first preference for the current level
                if (prefix.isdigit() and preference_list[0] == 'number') or (prefix.isalpha() and preference_list[0] == 'letter') or (not prefix.isdigit() and not prefix.isalpha() and preference_list[0] == 'other'):
                    # prefix already has the preferred type: nothing is
                    # returned on this path
                    pass
                else:
                    # handle inconsistent prefixes according to the mode argument
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # assign a new prefix based on the first preference for the current level
                        first_type = preference_list[0]
                        if first_type == 'number':
                            sorted_number_prefixes = sorted([p for p in previous_prefixes if p.isdigit()], key=int)
                            new_prefix = str(int(sorted_number_prefixes[-1]) + 1) if sorted_number_prefixes else '1'
                        elif first_type == 'letter':
                            sorted_letter_prefixes = sorted([p for p in previous_prefixes if p.isalpha()])
                            new_prefix = chr(ord(sorted_letter_prefixes[-1]) + 1) if sorted_letter_prefixes else 'A'
                        else:
                            symbols = ['•', '◦', '▪', '▫', '▸', '◂', '▴', '▾']
                            sorted_symbol_prefixes = sorted([p for p in previous_prefixes if p in symbols], key=symbols.index)
                            new_prefix = symbols[symbols.index(sorted_symbol_prefixes[-1]) + 1] if sorted_symbol_prefixes else symbols[0]
                        # The raw '({})' template is returned here, without
                        # .format(), unlike the other 'other' returns.
                        return (new_prefix, format_dict['other'], content, False)
            else:
                return (hashed_prefix, format_dict['other'].format(prefix), content, False)
def infer_prefix_format(input_string):
    """Derive prefix-format entries from the first line of ``input_string``.

    Every occurrence of a single non-space character followed by one or
    more non-word characters in the first line yields one single-key
    dictionary: ``{'number': <separator>}`` for a digit,
    ``{'letter': <separator>}`` for a letter, and ``{'other': '({})'}`` for
    anything else.

    Args:
        input_string: Text whose first line (up to the first newline) is
            inspected.

    Returns:
        A list of single-key dictionaries in order of occurrence; empty
        when the first line contains no such pair.
    """
    head = input_string.split('\n')[0]
    formats = []
    for symbol, trailing in re.findall(r'(\w|\S)(\W+)', head):
        if symbol.isdigit():
            entry = {'number': trailing}
        elif symbol.isalpha():
            # Every letter, 'o' included, is handled here.
            entry = {'letter': trailing}
        else:
            entry = {'other': '({})'}
        formats.append(entry)
    return formats

# Sample input: prose paragraphs interleaved with lists in several marker
# styles (numbers, letters, Roman numerals, bullets) plus two lone items.
testData = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus. \n\n1. England Cities:\n- Liverpool\n- London\n- Huyton\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nB. Spain Cities: \na. Place1 \nb. Place2 \nc. Place3\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nIII. Germany Cities: \ni. Place1 \nii. Place2 \niii. Place3\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n1. England Cities:\n 1. Liverpool\n 2. London\n 3. Huyton\n \nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n• Spain Cities:\n◦ Place1\n◦ Place2\n◦ Place3\n\na. Germany\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\na. England\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus."
# Pass 1: keep only the lines that look like list items and show them.
output1 = filter_lists(testData)
print(output1)
# Pass 2: feed the filtered lines through the prefix normalizer/formatter;
# prefix_format and hierarchy are left to be inferred from the text.
output2 = process_string(output1)
print("The output is: "+output2)

Purpose of Code:

I’ve created a Python function that should take a string as input. This string will contain a series of lists along with non-list items. The code should output only the list data, using the item at the top of each list to contextualize the printing of each list item, and remove non-list items unrelated to lists. An example of the input and expected output is provided below.

Problem with Code:

I’ve been able to extract only the list items from the input, but thus far, I haven’t been able to output them in the format I described earlier (as shown in the expected output below).

Input:

"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus. \n\n1. England Cities:\n- Liverpool\n- London\n- Huyton\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nB. Spain Cities: \na. Place1 \nb. Place2 \nc. Place3\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nIII. Germany Cities: \ni. Place1 \nii. Place2 \niii. Place3\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n1. England Cities:\n 1. Liverpool\n 2. London\n 3. Huyton\n \nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n• Spain Cities:\n◦ Place1\n◦ Place2\n◦ Place3\n\na. Germany\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\na. England\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus."

Expected Output:

['England Cities: Liverpool', 'England Cities: London', 'England Cities: Huyton', 'Spain Cities: Place1', 'Spain Cities: Place2', 'Spain Cities: Place3', 'Germany Cities: Place1', 'Germany Cities: Place2', 'Germany Cities: Place3', 'England Cities: Liverpool', 'England Cities: London', 'England Cities: Huyton', 'Spain Cities: Place1', 'Spain Cities: Place2', 'Spain Cities: Place3', 'Germany', 'England']

Actual Output:


1. England Cities:
- Liverpool
- London
- Huyton
B. Spain Cities: 
a. Place1 
b. Place2 
c. Place3
i. Place1 
ii. Place2 
iii. Place3
1. England Cities:
 1. Liverpool
 2. London
 3. Huyton
• Spain Cities:
◦ Place1
◦ Place2
◦ Place3
a. Germany
a. England

h ({}).
h ({})lace2
h ({})iverpool
h ({})ermany

How can I modify my code to get it working?

hansgeunsmeyer · September 9, 2023, 4:19pm

It seems you spent a lot of time writing this, but the code seems too complicated and brittle for its purpose. Have you considered writing a recursive parser instead, for instance with SLY (Sly Lex Yacc), written by David Beazley (see the SLY documentation)? Using a library like SLY makes this kind of code far easier to debug and maintain.
(It might seem crazy now to rewrite everything you already wrote - but it might pay off in the end… And seeing what you wrote, I’m sure you will like the Sly package.)
Alternatively – have you tried to trace into it to see where it’s going wrong?