I was recently working on parsing html txt using beautiful soup using python. I wrote the following code for scraping the tables so I could print them.
#TABLES
# let's grab the all pages code.
pages_dict = filing_documents[document_id]['pages_code']
# initalize a dictionary to store all the anchors we find.
tables_dict = {}
new_tables_dict = {}
# loop through each page
for page_num in pages_dict:
# grab the actual text
page_code = pages_dict[page_num]
# find all the anchors in the page, that have the attribute 'name'
tables_found = page_code.find_all('table')
# number of anchors found
num_found = len(tables_found)
# each page is going to be checked, so let's have another dictionary that'll house all the anchors found.
tables_dict[page_num]= {(table_id + 1): table for table_id, table in enumerate(tables_found)}
if len(tables_dict) != 0:
# loop through the dictionary
for table_id in tables_dict:
# grab the table
table_html = tables_dict[table_id]
# grab all the rows.
table_rows = table_html.find_all('tr')
# parse the table, first loop through the rows, then each element, and then parse each element.
parsed_table = [
[element.get_text(strip=True) for element in row.find_all('td')]
for row in table_rows
]
# keep the original just to be safe.
tables_dict[table_id]['original_table'] = table_html
# add the new parsed table.
tables_dict[table_id]['parsed_table'] = parsed_table
# here some additional steps you can take to clean up the data - Removing '$'.
parsed_table_cleaned = [
[element for element in row if element != '$']
for row in parsed_table
]
# here some additional steps you can take to clean up the data - Removing Blanks.
parsed_table_cleaned = [
[element for element in row if element != None]
for row in parsed_table_cleaned
]
else:
# if there are no tables then just have the id equal NONE
tables_dict[1]['original_table'] = None
tables_dict[1]['parsed_table'] = None
filing_documents[document_id]['anchor_search'] = link_anchor_dict
filing_documents[document_id]['tables_search'] = tables_dict
a = filing_documents[document_id]['tables_search']['table_id']['parsed_table']
print(a)
When I try to print a, it comes up with the attribute error dict object has no attribute find_all error. Is there anything I can add or reduce to fix this?