Help me with this web scraper for goodreads.com lists

flowey41 · November 25, 2024, 1:38pm

This is my code: It did work a year ago.

from bs4 import BeautifulSoup
import requests
from time import sleep
from os import system, name

# introduction
print('This program sorts standalone books or first books in a series over <specify_amount> ratings of any given')
input('list on goodreads.com by average rating across all pages and stores the results in a txt file.')

# taking user input
file = input('What shall be the name of the file?: ') + '.txt'
base_url = input('Return the link of the list you want to sort: ')
url_template = base_url + '?page={}'
pages = int(input('How many pages does the list have?: '))
min_ratings =int(input('Specify the minimum amount of ratings a book should have: '))

sleep(1)

# making place for the book details
print('Making place for book details...')
book_details = []

sleep(1)

# searching for book details
print(f'Extracting book details and sorting out books not first in a series and under {min_ratings} ratings...')

sleep(1)

for page in range(1, pages + 1):
    current_url = url_template.format(page)
    print(f'Processing page {page}/{pages}...', end='\r')
    # setting up the web scraper
    html_text = requests.get(current_url).text
    soup = BeautifulSoup(html_text, 'lxml')
    books = soup.find_all('tr')
    # extracting book details
    for book in books:
        title = book.find('span', itemprop="name").text.strip()
        author = book.find('span', itemprop="author").find('span', itemprop="name").text.strip()
        rating = book.find('span', class_="minirating").text.split(' ')
        num_ratings = int(rating[-2].replace(',', ''))
        avg_rating = None
        # searching for the avg rating in rating
        for element in rating:
            try:
                avg_rating = float(element)
                break
            except ValueError:
                continue
        # checking if book is first in series
        first_in_series = not any(char.isdigit() and char != '1' for char in title) and not '11' in title
        # sorting out books under <min_ratings>
        if num_ratings > min_ratings and first_in_series:
            book_details.append({'title': title, 'author': author, 'avg_rating': avg_rating})

sleep(1)

# sorting the books by average rating
print('Sorting books by average rating...')
book_details.sort(key=lambda x: x['avg_rating'], reverse=True)

sleep(1)

# writing results to txt file
print('Writing results to the file...')
with open(file, 'w', encoding='utf-8') as f:
    for book in book_details:
        f.write(f"\n{book['title']} by {book['author']} / {book['avg_rating']}\n")

sleep(1)

print('Done.')

r0w · December 4, 2024, 4:35pm

Could you please explain your exact problem
The only thing I noticed, while quickly skimming over it, was that the numerous sleeps might lead to problems if the printing is line-buffered and not flushed properly.
This would have been a problem a year ago as well, though.