I have Python 3.11 on Windows 11.
- This is my 6th different program I’ve found on the internet to try and write something that will summarize a PDF locally.
- My PDFs were downloaded from pubmed.gov. Each one gets an error in byte position 10 but the byte value that the Python program doesn’t like varies.
- Can you get this to work with your own PDF file? I have tried 3 different files but they were all from Pubmed. A fourth file not from Pubmed also got the same error.
The latest error is: “ERROR: ‘utf-8’ codec can’t decode byte 0xe2 in position 10: invalid continuation byte”
Here’s my code.
r'''
Summarize a PDF.
From https://www.freedium.cfd/a3c71ff906df
Install these: pip install nltk transformers
This gets error:
pdf_text = file.read().decode('utf-8')
^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf6 in position 10: invalid start byte
'''
from datetime import datetime
import inspect
import os
from os.path import exists
import pprint
import re
import sys
import glob # To get list of files.
import nltk
from transformers import pipeline
pdffile = r'data\_OceanofPDF.com_Tell_Your_Children_-_Alex_Berenson.pdf'
if not exists(pdffile):
print(f"ERROR: File {pdffile} does not exist")
sys.exit()
# First download essential NKTK corpora and models.
print("Downloading models...")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
#######################################################
def summarize_pdf(file_path):
procname = str(inspect.stack()[0][3]) + ":"
# Read the PDF document
with open(file_path, 'rb') as file:
try:
pdf_text = file.read().decode('utf-8')
except Exception as e:
print(f"{procname} ERROR: {e}")
sys.exit()
# Tokenize the text into sentences
sentences = nltk.sent_tokenize(pdf_text)
# Combine the sentences into a single string
document = ' '.join(sentences)
# Initialize the BART summarization pipeline
summarizer = pipeline('summarization')
# Generate the summary
summary = summarizer(document, max_length=150, min_length=30, do_sample=False)
# Extract the summarized text
summarized_text = summary[0]['summary_text']
return summarized_text
#######################################################
#######################################################
#######################################################
#######################################################
print("Summarize pdf.")
outtext = summarize_pdf(pdffile)
print(outtext)
How do I fix this error?
I’m stumped. Thank you.