Need some help with the code logic

I have Python code that extracts URLs and their display text, along with the URL status, from a PDF, and as a result it produces an .xlsx file listing the URLs and their display text. But if the display text spans multiple lines, each line is treated as a separate entry for a new URL. We don't want that; we need multi-line display text combined into a single line.

import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
import os
import re
import requests
from openpyxl import Workbook
from openpyxl.styles import Alignment, Font as xlFont, PatternFill
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from docx import Document
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io

# Function to combine multi-line text
def combine_multiline_text(text):
    lines = text.split('\n')
    combined_lines = []
    buffer = ""
    for line in lines:
        if line.strip() == "":
            combined_lines.append(buffer.strip())
            buffer = ""
        else:
            buffer += line.strip() + " "
    if buffer:
        combined_lines.append(buffer.strip())
    return "\n".join(combined_lines)

# Function to extract URLs and email IDs from text
def extract_urls_from_text(text, page_or_paragraph_num):
    urls = []
    url_pattern = re.compile(r'(https?://\S+|http://\S+|www\.\S+|mailto:\S+)')
    text = combine_multiline_text(text)  # Combine multiline text first
    matches = re.findall(url_pattern, text)
    for match in matches:
        urls.append((match.strip(), text.strip(), page_or_paragraph_num))  # Combine multiline display text
    return urls

# Function to extract URLs from PDF file
def extract_urls_from_pdf(file_path):
    pdf_document = fitz.open(file_path)
    urls = set()
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        links = page.get_links()
        for link in links:
            if 'uri' in link:
                display_text = ""
                if 'title' in link:
                    display_text = link['title']
                else:
                    rect = fitz.Rect(link["from"])
                    display_text = page.get_text("text", clip=rect).strip()
                    if not display_text:
                        image = page.get_pixmap(clip=rect)
                        img = Image.open(io.BytesIO(image.tobytes()))
                        display_text = pytesseract.image_to_string(img).strip()
                urls.add((link['uri'].strip(), display_text, page_num + 1))
        text = page.get_text("text")
        text = combine_multiline_text(text)
        urls.update(extract_urls_from_text(text, page_num + 1))
    return sorted(list(urls), key=lambda x: x[2])

# Function to extract URLs from DOCX file
def extract_urls_from_docx(file_path):
    doc = Document(file_path)
    urls = set()
    for para_num, para in enumerate(doc.paragraphs):
        para_text = combine_multiline_text(para.text)
        urls.update(extract_urls_from_text(para_text, para_num + 1))
        for run in para.runs:
            if run.underline and re.match(r'(https?://\S+|http://\S+|www\.\S+|mailto:\S+)', run.text):
                urls.add((run.text.strip(), para.text.strip(), para_num + 1))
            if run.hyperlink and re.match(r'(https?://\S+|http://\S+|www\.\S+|mailto:\S+)', run.hyperlink.target):
                urls.add((run.hyperlink.target.strip(), para.text.strip(), para_num + 1))
    return sorted(list(urls), key=lambda x: x[2])

# Function to extract URLs from PPTX file
def extract_urls_from_pptx(file_path):
    prs = Presentation(file_path)
    urls = set()

    def extract_url_from_shape(shape, slide_num):
        if hasattr(shape, "hyperlink") and shape.hyperlink.address:
            urls.add((shape.hyperlink.address.strip(), shape.text if hasattr(shape, "text") else "Image", slide_num + 1))
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            for element in shape.shapes:
                if hasattr(element, "hyperlink") and element.hyperlink.address:
                    urls.add((element.hyperlink.address.strip(), element.text if hasattr(element, "text") else "Image", slide_num + 1))

    for slide_num, slide in enumerate(prs.slides):
        for shape in slide.shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.TEXT_BOX:
                textbox_text = ""
                for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        if run.hyperlink and run.hyperlink.address:
                            urls.add((run.hyperlink.address.strip(), run.text, slide_num + 1))
                        else:
                            textbox_text += run.text
                textbox_text = combine_multiline_text(textbox_text)
                urls.update(extract_urls_from_text(textbox_text, slide_num + 1))
            elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                extract_url_from_shape(shape, slide_num)
            elif hasattr(shape, "hyperlink") and shape.hyperlink.address:
                urls.add((shape.hyperlink.address.strip(), "Image", slide_num + 1))

    return sorted(list(urls), key=lambda x: x[2])

# Function to extract URLs from any supported file type
def extract_urls(file_path):
    if file_path.lower().endswith('.pdf'):
        return extract_urls_from_pdf(file_path)
    elif file_path.lower().endswith('.docx'):
        return extract_urls_from_docx(file_path)
    elif file_path.lower().endswith('.pptx'):
        return extract_urls_from_pptx(file_path)
    else:
        return []

# Function to check the status of URLs
def check_urls(urls):
    results = []
    for url, display_text, page_or_paragraph_num in urls:
        try:
            response = requests.get(url, allow_redirects=True, timeout=20)
            status = 'Active' if response.status_code == 200 else f'Inactive (Status Code: {response.status_code})'
        except requests.RequestException as e:
            status = f'Error: {str(e)}'
        results.append((url, display_text, status, page_or_paragraph_num))
    return results

# Function to write results to an Excel file
def write_to_excel(file_path, results, excel_dir):
    sorted_results = sorted(results, key=lambda x: (x[3], x[1]))

    wb = Workbook()
    ws = wb.active
    ws.append(["URL", "Display Text", "Status", "Page or Paragraph Number"])

    for cell in ws["1:1"]:
        cell.font = xlFont(bold=True)
        cell.alignment = Alignment(horizontal="center", vertical="center")

    for url, display_text, status, page_or_paragraph_num in sorted_results:
        ws.append([url, display_text, status, page_or_paragraph_num])
        cell = ws.cell(row=ws.max_row, column=1, value=url)
        cell.hyperlink = url
        cell.style = "Hyperlink"
        if 'Inactive' in status or 'Error' in status:
            cell.fill = PatternFill(start_color="FFEE1111", end_color="FFEE1111", fill_type="solid")

    ws.column_dimensions['A'].width = 60
    for col in ['B', 'C', 'D']:
        ws.column_dimensions[col].width = 30

    for col in ws.columns:
        for cell in col:
            cell.alignment = Alignment(wrap_text=True, vertical="center")

    excel_filename = os.path.join(excel_dir, os.path.splitext(os.path.basename(file_path))[0] + "_URL_Report.xlsx")
    wb.save(excel_filename)
    return excel_filename

# Function to highlight URLs in a PDF file and add comments
def highlight_and_comment_urls_in_pdf(file_path, results):
    pdf_document = None
    try:
        pdf_document = fitz.open(file_path)
        for url, display_text, status, page_num in results:
            page = pdf_document.load_page(page_num - 1)
            links = page.get_links()
            for link in links:
                if 'uri' in link and link['uri'].strip().lower() == url.lower():
                    rect = fitz.Rect(link["from"])
                    highlight = page.add_highlight_annot(rect)
                    if 'Inactive' in status:
                        highlight.set_colors({"stroke": (1, 0, 0)})
                    elif 'Error' in status:
                        highlight.set_colors({"stroke": (0, 0, 1)})
                    else:
                        highlight.set_colors({"stroke": (0, 1, 0)})
                    highlight.update()
                    highlight.set_info(title="URL Status", content=f"URL: {url}\nStatus: {status}")
        highlighted_pdf_path = os.path.splitext(file_path)[0] + "_highlighted.pdf"
        pdf_document.save(highlighted_pdf_path)
        return highlighted_pdf_path
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {e}")
        if pdf_document:
            pdf_document.close()
        raise e

# Function to process selected files
def process_files(file_paths):
    if not file_paths:
        return

    root_dir = os.path.dirname(file_paths[0])
    files_without_urls = []

    for file_path in file_paths:
        urls = extract_urls(file_path)
        if not urls:
            files_without_urls.append(os.path.basename(file_path))
            continue
        results = check_urls(urls)
        excel_dir = os.path.join(root_dir, "URL_Reports")
        os.makedirs(excel_dir, exist_ok=True)
        write_to_excel(file_path, results, excel_dir)
        if file_path.lower().endswith('.pdf'):
            highlight_and_comment_urls_in_pdf(file_path, results)

    if files_without_urls:
        txt_path = os.path.join(root_dir, "Files_Without_URLs.txt")
        write_to_txt(files_without_urls, txt_path)

    messagebox.showinfo("Processing Complete", "URL extraction completed.")

# Function to write files without URLs to a text file
def write_to_txt(files_without_urls, txt_path):
    with open(txt_path, 'w') as f:
        f.write("Files without URLs:\n")
        for file_name in files_without_urls:
            f.write(file_name + '\n')

# Function to compare URLs between two files and highlight URLs in PDFs
def compare_files(file1_path, file2_path):
    urls1 = extract_urls(file1_path)
    urls2 = extract_urls(file2_path)

    results1 = check_urls(urls1)
    results2 = check_urls(urls2)

    root_dir = os.path.dirname(file1_path)
    excel_dir = os.path.join(root_dir, "URL_Reports")
    os.makedirs(excel_dir, exist_ok=True)

    # Write individual reports for source and target files
    file1_report = write_to_excel(file1_path, results1, excel_dir)
    file2_report = write_to_excel(file2_path, results2, excel_dir)

    # Highlight URLs in PDFs for both source and target files
    if file1_path.lower().endswith('.pdf'):
        highlight_and_comment_urls_in_pdf(file1_path, results1)
    if file2_path.lower().endswith('.pdf'):
        highlight_and_comment_urls_in_pdf(file2_path, results2)

    # Combine and compare results from both files in sequence
    max_len = max(len(results1), len(results2))
    combined_results = []

    for i in range(max_len):
        source_page_or_paragraph = results1[i][3] if i < len(results1) else ""
        source_url = results1[i][0].strip().lower() if i < len(results1) else ""
        source_display_text = results1[i][1] if i < len(results1) else ""

        target_page_or_paragraph = results2[i][3] if i < len(results2) else ""
        target_url = results2[i][0].strip().lower() if i < len(results2) else ""
        target_display_text = results2[i][1] if i < len(results2) else ""

        compare_result = "Match" if source_url == target_url else "Unmatched"

        combined_results.append([
            source_page_or_paragraph, source_url, source_display_text,
            target_page_or_paragraph, target_url, target_display_text, compare_result
        ])

    comparison_filename = os.path.join(excel_dir, "Comparison_Report.xlsx")
    wb = Workbook()
    ws = wb.active
    ws.append([
        "Source Page or Paragraph Number", "Source URL", "Source Display Text",
        "Target Page or Paragraph Number", "Target URL", "Target Display Text", "Compare"
    ])

    for cell in ws["1:1"]:
        cell.font = xlFont(bold=True)
        cell.alignment = Alignment(horizontal="center", vertical="center")

    for result in combined_results:
        ws.append(result)

    ws.column_dimensions['B'].width = 60
    for col in ['A', 'C', 'D', 'E', 'F', 'G']:
        ws.column_dimensions[col].width = 30

    for col in ws.columns:
        for cell in col:
            cell.alignment = Alignment(wrap_text=True, vertical="center")

    wb.save(comparison_filename)

    messagebox.showinfo("Comparison Complete", f"Comparison report saved as {comparison_filename}")

# Custom dialog for selecting source and target files
def custom_file_dialog():
    dialog = tk.Toplevel(root)
    dialog.title("Select Source and Target Files")
    dialog.geometry("600x250")
    dialog.resizable(False, False)

    dialog.update_idletasks()
    width = dialog.winfo_width()
    height = dialog.winfo_height()
    x = (dialog.winfo_screenwidth() // 2) - (width // 2)
    y = (dialog.winfo_screenheight() // 2) - (height // 2)
    dialog.geometry(f"{width}x{height}+{x}+{y}")

    label = ttk.Label(dialog, text="Select Source and Target Files", font=("Helvetica", 14, "bold"))
    label.grid(row=0, column=0, columnspan=3, pady=10)

    file1_path_var = tk.StringVar()
    file2_path_var = tk.StringVar()

    def open_file1_dialog():
        file1_path = filedialog.askopenfilename(title="Select Source File", filetypes=[("Supported files", "*.pdf *.docx *.pptx")])
        file1_path_var.set(file1_path)

    def open_file2_dialog():
        file2_path = filedialog.askopenfilename(title="Select Target File", filetypes=[("Supported files", "*.pdf *.docx *.pptx")])
        file2_path_var.set(file2_path)

    file1_label = ttk.Label(dialog, text="Source File:", font=("Helvetica", 12))
    file1_label.grid(row=1, column=0, padx=10, pady=5, sticky=tk.E)
    file1_entry = ttk.Entry(dialog, textvariable=file1_path_var, width=50)
    file1_entry.grid(row=1, column=1, padx=10, pady=5)
    file1_button = ttk.Button(dialog, text="Browse", command=open_file1_dialog)
    file1_button.grid(row=1, column=2, padx=10, pady=5)

    file2_label = ttk.Label(dialog, text="Target File:", font=("Helvetica", 12))
    file2_label.grid(row=2, column=0, padx=10, pady=5, sticky=tk.E)
    file2_entry = ttk.Entry(dialog, textvariable=file2_path_var, width=50)
    file2_entry.grid(row=2, column=1, padx=10, pady=5)
    file2_button = ttk.Button(dialog, text="Browse", command=open_file2_dialog)
    file2_button.grid(row=2, column=2, padx=10, pady=5)

    def on_generate_report():
        dialog.destroy()

    generate_button = ttk.Button(dialog, text="Generate Report", command=on_generate_report, style="Generate.TButton")
    generate_button.grid(row=3, column=0, columnspan=3, pady=10)

    dialog.transient(root)
    dialog.grab_set()
    root.wait_window(dialog)

    return file1_path_var.get(), file2_path_var.get()

# Function to select files for comparison
def select_files_for_comparison():
    file1_path, file2_path = custom_file_dialog()
    if not file1_path or not file2_path:
        return
    compare_files(file1_path, file2_path)

# Function to select files for URL extraction
def select_files():
    file_paths = filedialog.askopenfilenames(title="Select Files", filetypes=[("Supported files", "*.pdf *.docx *.pptx")])
    process_files(file_paths)

# Function to highlight URLs in selected PDF files
def highlight_pdfs():
    file_paths = filedialog.askopenfilenames(title="Select PDF Files for Highlighting", filetypes=[("PDF files", "*.pdf")])
    if not file_paths:
        return

    for file_path in file_paths:
        urls = extract_urls(file_path)
        results = check_urls(urls)
        highlight_and_comment_urls_in_pdf(file_path, results)

    messagebox.showinfo("Highlighting Complete", "Highlighting of URLs in PDFs completed.")

# Setup GUI
root = tk.Tk()
root.title("URL Extractor and Comparator")
root.geometry("400x300")
root.resizable(False, False)

root.update_idletasks()
width = root.winfo_width()
height = root.winfo_height()
x = (root.winfo_screenwidth() // 2) - (width // 2)
y = (root.winfo_screenheight() // 2) - (height // 2)
root.geometry(f"{width}x{height}+{x}+{y}")

style = ttk.Style()
style.configure("TButton", font=("Helvetica", 12))
style.configure("Accent.TButton", foreground="black", background="#4CAF50", font=("Helvetica", 12, "bold"))
style.map("Accent.TButton", background=[("active", "#45a049"), ("disabled", "#A9A9A9")])
style.configure("Generate.TButton", foreground="black", background="#FF5733", font=("Helvetica", 12, "bold"))
style.map("Generate.TButton", background=[("active", "#E64A19"), ("disabled", "#A9A9A9")])

frame = ttk.Frame(root, padding="20")
frame.grid(row=0, column=0, sticky="nsew")
root.grid_rowconfigure(0, weight=1)
root.grid_columnconfigure(0, weight=1)

# Center the content in the frame
frame.grid_rowconfigure(0, weight=1)
frame.grid_rowconfigure(1, weight=1)
frame.grid_rowconfigure(2, weight=1)
frame.grid_rowconfigure(3, weight=1)
frame.grid_columnconfigure(0, weight=1)

label = ttk.Label(frame, text="URL Extractor and Comparator", font=("Arial", 16))
label.grid(row=0, column=0, pady=5, sticky="n")

extract_button = ttk.Button(frame, text="Select Files for URL Extraction", command=select_files, style="Accent.TButton")
extract_button.grid(row=1, column=0, pady=5)

compare_button = ttk.Button(frame, text="Select Files for Comparison", command=select_files_for_comparison, style="Accent.TButton")
compare_button.grid(row=2, column=0, pady=5)

highlight_button = ttk.Button(frame, text="Highlight URLs in PDFs", command=highlight_pdfs, style="Accent.TButton")
highlight_button.grid(row=3, column=0, pady=5)

root.mainloop()

So what is your question? In what way is the code not behaving as expected?

I want the multi-line display text extracted onto a single line.

Yes, you’ve stated that, but you haven’t told us what issues you’re having.

While extracting the list of URLs from the PDF, it extracts the URLs along with their display text. But the display text doesn't come out as the complete sentence (the full display text) on a single line if the text breaks across two lines.

This is likely because links in PDFs are more like little sticky notes
stuck on the page after rendering instead of like HTML where the link is
a tag surrounding the display text. Ugh.

So if you look at your code, you have to get the display text by getting
the link, then looking at the Rect associated with the link, which is
a rectangle where the display text is drawn on the page. And then OCRing
that part of the page. O_o gah!
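
In rough outline, the per-link part of your extract_urls_from_pdf boils
down to this (just a condensed sketch of what you already have):

# link is one dict from page.get_links(); page is the fitz page it came from
rect = fitz.Rect(link["from"])                       # rectangle the link annotation covers
display_text = page.get_text("text", clip=rect).strip()
if not display_text:                                 # nothing extractable as text -> fall back to OCR
    pix = page.get_pixmap(clip=rect)
    img = Image.open(io.BytesIO(pix.tobytes()))
    display_text = pytesseract.image_to_string(img).strip()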

When the display text is folded over a line ending you need more than
one rectangle to cover it: one for the end of the first line and one for
the beginning of the next. Or more if the text is even longer.

According to my copy of the PDF32000_2008.pdf spec, section 12.5.6.5
describes Link Annotations as having a QuadPoints field in the
dictionary:

 (Optional; PDF 1.6) An array of 8 × n numbers specifying the
 coordinates of n quadrilaterals in default user space that
 comprise the region in which the link should be activated. The
 coordinates for each quadrilateral are given in the order

 x1 y1 x2 y2 x3 y3 x4 y4

 specifying the four vertices of the quadrilateral in counterclockwise
 order. For orientation purposes, such as when applying an
 underline border style, the bottom of a quadrilateral is the
 line formed by (x1 , y1) and (x2 , y2).

 If this entry is not present or the conforming reader does not
 recognize it, the region specified by the Rect entry should be
 used.  QuadPoints shall be ignored if any coordinate in the
 array lies outside the region specified by Rect.

This would let you get the multiple rectangles covering the display text
and OCR each and join them together. I have no idea if the fitz module
lets you get at this.
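
If you can get at those numbers, the per-line extraction might look
roughly like this (a sketch only; it assumes the quads have already been
mapped into the coordinate space fitz expects for clipping):

def text_from_quads(page, quads):
    # quads: list of 8-number sequences (x1, y1, ..., x4, y4), one per line of the link
    pieces = []
    for q in quads:
        xs, ys = q[0::2], q[1::2]
        rect = fitz.Rect(min(xs), min(ys), max(xs), max(ys))   # bounding box of one quadrilateral
        piece = page.get_text("text", clip=rect).strip()
        if piece:
            pieces.append(" ".join(piece.split()))             # collapse internal line breaks
    return " ".join(pieces)                                    # wrapped display text on one line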

The PyMuPDF docs suggest that the xref for the link is available - maybe
that lets you go from the link to the raw dictionary which defines it and
would presumably have the QuadPoints field.
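
Something like this might work for pulling QuadPoints out via the xref
(untested; the exact shape of what xref_get_key() returns is an
assumption on my part):

def quadpoints_for_link(doc, link):
    # Read the raw /QuadPoints entry of the link annotation through its xref.
    xref = link.get("xref", 0)
    if not xref:
        return []
    key_type, value = doc.xref_get_key(xref, "QuadPoints")
    if key_type != "array":
        return []                       # no QuadPoints entry; fall back to link["from"]
    numbers = [float(n) for n in value.strip("[]").split()]
    # Group the flat list into 8-number quads, one quadrilateral per line of text.
    return [tuple(numbers[i:i + 8]) for i in range(0, len(numbers), 8)]

One caveat: the spec says those coordinates are in default user space
(y measured from the bottom of the page), so they may need converting
(page.transformation_matrix is probably the tool) before being used as a
clip rectangle like in text_from_quads() above.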

Or… (this just occurred to me) maybe there are just multiple links.
Maybe if they are adjacent and have the same URL you might string their
text together. Feels even more hacky, and may not work anyway.
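
If you want to try it, a rough sketch (assuming consecutive entries from
page.get_links() with the same URI really are the consecutive lines of
one wrapped link, which is exactly the part I wouldn't bet on):

def merged_links(page):
    # Walk the page's links in order and merge runs that share the same URI,
    # joining the clipped display text of each piece onto a single line.
    merged = []                              # list of (uri, display_text) pairs
    for link in page.get_links():
        if 'uri' not in link:
            continue
        rect = fitz.Rect(link["from"])
        piece = " ".join(page.get_text("text", clip=rect).split())
        if merged and merged[-1][0] == link['uri']:
            merged[-1] = (link['uri'], (merged[-1][1] + " " + piece).strip())
        else:
            merged.append((link['uri'], piece))
    return merged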