I have Python code that extracts URLs, their display text, and the URL status from a PDF and writes the result to an .xlsx file listing each URL and its display text. The problem is that when a link's display text spans multiple lines, each line is treated as a separate entry (as if it were a new URL). I don't want that; I need multi-line display text combined into a single line.
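For example, if a link's display text wraps in the PDF like this:

Click here to read the
full annual report

then the Excel row should show the display text as one line, "Click here to read the full annual report", instead of the lines being split up or picked up as separate entries. A minimal sketch of the normalization I have in mind (normalize_display_text is a hypothetical helper name, not something already in my script):

def normalize_display_text(text):
    # Collapse every run of whitespace, including newlines, into a single space.
    return " ".join(text.split())

# normalize_display_text("Click here to read the\nfull annual report")
# returns "Click here to read the full annual report"

Here is my full script: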
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
import os
import re
import requests
from openpyxl import Workbook
from openpyxl.styles import Alignment, Font as xlFont, PatternFill
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from docx import Document
import fitz # PyMuPDF
import pytesseract
from PIL import Image
import io
# Function to combine multi-line text
def combine_multiline_text(text):
    lines = text.split('\n')
    combined_lines = []
    buffer = ""
    for line in lines:
        if line.strip() == "":
            combined_lines.append(buffer.strip())
            buffer = ""
        else:
            buffer += line.strip() + " "
    if buffer:
        combined_lines.append(buffer.strip())
    return "\n".join(combined_lines)
# Function to extract URLs and email IDs from text
def extract_urls_from_text(text, page_or_paragraph_num):
    urls = []
    url_pattern = re.compile(r'(https?://\S+|http://\S+|www\.\S+|mailto:\S+)')
    text = combine_multiline_text(text)  # Combine multiline text first
    matches = re.findall(url_pattern, text)
    for match in matches:
        urls.append((match.strip(), text.strip(), page_or_paragraph_num))  # Combine multiline display text
    return urls
# Function to extract URLs from PDF file
def extract_urls_from_pdf(file_path):
    pdf_document = fitz.open(file_path)
    urls = set()
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        links = page.get_links()
        for link in links:
            if 'uri' in link:
                display_text = ""
                if 'title' in link:
                    display_text = link['title']
                else:
                    rect = fitz.Rect(link["from"])
                    display_text = page.get_text("text", clip=rect).strip()
                    if not display_text:
                        image = page.get_pixmap(clip=rect)
                        img = Image.open(io.BytesIO(image.tobytes()))
                        display_text = pytesseract.image_to_string(img).strip()
                urls.add((link['uri'].strip(), display_text, page_num + 1))
        text = page.get_text("text")
        text = combine_multiline_text(text)
        urls.update(extract_urls_from_text(text, page_num + 1))
    return sorted(list(urls), key=lambda x: x[2])
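# NOTE: display_text above comes from page.get_text("text", clip=rect) (or OCR),
# and .strip() only removes leading/trailing whitespace, so when the visible
# link text wraps across lines it still contains internal newlines. This is the
# text I want collapsed into a single line in the Excel report.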
# Function to extract URLs from DOCX file
def extract_urls_from_docx(file_path):
    doc = Document(file_path)
    urls = set()
    for para_num, para in enumerate(doc.paragraphs):
        para_text = combine_multiline_text(para.text)
        urls.update(extract_urls_from_text(para_text, para_num + 1))
        for run in para.runs:
            if run.underline and re.match(r'(https?://\S+|http://\S+|www\.\S+|mailto:\S+)', run.text):
                urls.add((run.text.strip(), para.text.strip(), para_num + 1))
            if run.hyperlink and re.match(r'(https?://\S+|http://\S+|www\.\S+|mailto:\S+)', run.hyperlink.target):
                urls.add((run.hyperlink.target.strip(), para.text.strip(), para_num + 1))
    return sorted(list(urls), key=lambda x: x[2])
# Function to extract URLs from PPTX file
def extract_urls_from_pptx(file_path):
    prs = Presentation(file_path)
    urls = set()

    def extract_url_from_shape(shape, slide_num):
        if hasattr(shape, "hyperlink") and shape.hyperlink.address:
            urls.add((shape.hyperlink.address.strip(), shape.text if hasattr(shape, "text") else "Image", slide_num + 1))
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            for element in shape.shapes:
                if hasattr(element, "hyperlink") and element.hyperlink.address:
                    urls.add((element.hyperlink.address.strip(), element.text if hasattr(element, "text") else "Image", slide_num + 1))

    for slide_num, slide in enumerate(prs.slides):
        for shape in slide.shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.TEXT_BOX:
                textbox_text = ""
                for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        if run.hyperlink and run.hyperlink.address:
                            urls.add((run.hyperlink.address.strip(), run.text, slide_num + 1))
                        else:
                            textbox_text += run.text
                textbox_text = combine_multiline_text(textbox_text)
                urls.update(extract_urls_from_text(textbox_text, slide_num + 1))
            elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                extract_url_from_shape(shape, slide_num)
            elif hasattr(shape, "hyperlink") and shape.hyperlink.address:
                urls.add((shape.hyperlink.address.strip(), "Image", slide_num + 1))
    return sorted(list(urls), key=lambda x: x[2])
# Function to extract URLs from any supported file type
def extract_urls(file_path):
    if file_path.lower().endswith('.pdf'):
        return extract_urls_from_pdf(file_path)
    elif file_path.lower().endswith('.docx'):
        return extract_urls_from_docx(file_path)
    elif file_path.lower().endswith('.pptx'):
        return extract_urls_from_pptx(file_path)
    else:
        return []
# Function to check the status of URLs
def check_urls(urls):
    results = []
    for url, display_text, page_or_paragraph_num in urls:
        try:
            response = requests.get(url, allow_redirects=True, timeout=20)
            status = 'Active' if response.status_code == 200 else f'Inactive (Status Code: {response.status_code})'
        except requests.RequestException as e:
            status = f'Error: {str(e)}'
        results.append((url, display_text, status, page_or_paragraph_num))
    return results
# Function to write results to an Excel file
def write_to_excel(file_path, results, excel_dir):
    sorted_results = sorted(results, key=lambda x: (x[3], x[1]))
    wb = Workbook()
    ws = wb.active
    ws.append(["URL", "Display Text", "Status", "Page or Paragraph Number"])
    for cell in ws["1:1"]:
        cell.font = xlFont(bold=True)
        cell.alignment = Alignment(horizontal="center", vertical="center")
    for url, display_text, status, page_or_paragraph_num in sorted_results:
        ws.append([url, display_text, status, page_or_paragraph_num])
        cell = ws.cell(row=ws.max_row, column=1, value=url)
        cell.hyperlink = url
        cell.style = "Hyperlink"
        if 'Inactive' in status or 'Error' in status:
            cell.fill = PatternFill(start_color="FFEE1111", end_color="FFEE1111", fill_type="solid")
    ws.column_dimensions['A'].width = 60
    for col in ['B', 'C', 'D']:
        ws.column_dimensions[col].width = 30
    for col in ws.columns:
        for cell in col:
            cell.alignment = Alignment(wrap_text=True, vertical="center")
    excel_filename = os.path.join(excel_dir, os.path.splitext(os.path.basename(file_path))[0] + "_URL_Report.xlsx")
    wb.save(excel_filename)
    return excel_filename
# Function to highlight URLs in a PDF file and add comments
def highlight_and_comment_urls_in_pdf(file_path, results):
    pdf_document = None
    try:
        pdf_document = fitz.open(file_path)
        for url, display_text, status, page_num in results:
            page = pdf_document.load_page(page_num - 1)
            links = page.get_links()
            for link in links:
                if 'uri' in link and link['uri'].strip().lower() == url.lower():
                    rect = fitz.Rect(link["from"])
                    highlight = page.add_highlight_annot(rect)
                    if 'Inactive' in status:
                        highlight.set_colors({"stroke": (1, 0, 0)})
                    elif 'Error' in status:
                        highlight.set_colors({"stroke": (0, 0, 1)})
                    else:
                        highlight.set_colors({"stroke": (0, 1, 0)})
                    highlight.update()
                    highlight.set_info(title="URL Status", content=f"URL: {url}\nStatus: {status}")
        highlighted_pdf_path = os.path.splitext(file_path)[0] + "_highlighted.pdf"
        pdf_document.save(highlighted_pdf_path)
        return highlighted_pdf_path
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {e}")
        if pdf_document:
            pdf_document.close()
        raise e
# Function to process selected files
def process_files(file_paths):
    if not file_paths:
        return
    root_dir = os.path.dirname(file_paths[0])
    files_without_urls = []
    for file_path in file_paths:
        urls = extract_urls(file_path)
        if not urls:
            files_without_urls.append(os.path.basename(file_path))
            continue
        results = check_urls(urls)
        excel_dir = os.path.join(root_dir, "URL_Reports")
        os.makedirs(excel_dir, exist_ok=True)
        write_to_excel(file_path, results, excel_dir)
        if file_path.lower().endswith('.pdf'):
            highlight_and_comment_urls_in_pdf(file_path, results)
    if files_without_urls:
        txt_path = os.path.join(root_dir, "Files_Without_URLs.txt")
        write_to_txt(files_without_urls, txt_path)
    messagebox.showinfo("Processing Complete", "URL extraction completed.")
# Function to write files without URLs to a text file
def write_to_txt(files_without_urls, txt_path):
    with open(txt_path, 'w') as f:
        f.write("Files without URLs:\n")
        for file_name in files_without_urls:
            f.write(file_name + '\n')
# Function to compare URLs between two files and highlight URLs in PDFs
def compare_files(file1_path, file2_path):
    urls1 = extract_urls(file1_path)
    urls2 = extract_urls(file2_path)
    results1 = check_urls(urls1)
    results2 = check_urls(urls2)
    root_dir = os.path.dirname(file1_path)
    excel_dir = os.path.join(root_dir, "URL_Reports")
    os.makedirs(excel_dir, exist_ok=True)
    # Write individual reports for source and target files
    file1_report = write_to_excel(file1_path, results1, excel_dir)
    file2_report = write_to_excel(file2_path, results2, excel_dir)
    # Highlight URLs in PDFs for both source and target files
    if file1_path.lower().endswith('.pdf'):
        highlight_and_comment_urls_in_pdf(file1_path, results1)
    if file2_path.lower().endswith('.pdf'):
        highlight_and_comment_urls_in_pdf(file2_path, results2)
    # Combine and compare results from both files in sequence
    max_len = max(len(results1), len(results2))
    combined_results = []
    for i in range(max_len):
        source_page_or_paragraph = results1[i][3] if i < len(results1) else ""
        source_url = results1[i][0].strip().lower() if i < len(results1) else ""
        source_display_text = results1[i][1] if i < len(results1) else ""
        target_page_or_paragraph = results2[i][3] if i < len(results2) else ""
        target_url = results2[i][0].strip().lower() if i < len(results2) else ""
        target_display_text = results2[i][1] if i < len(results2) else ""
        compare_result = "Match" if source_url == target_url else "Unmatched"
        combined_results.append([
            source_page_or_paragraph, source_url, source_display_text,
            target_page_or_paragraph, target_url, target_display_text, compare_result
        ])
    comparison_filename = os.path.join(excel_dir, "Comparison_Report.xlsx")
    wb = Workbook()
    ws = wb.active
    ws.append([
        "Source Page or Paragraph Number", "Source URL", "Source Display Text",
        "Target Page or Paragraph Number", "Target URL", "Target Display Text", "Compare"
    ])
    for cell in ws["1:1"]:
        cell.font = xlFont(bold=True)
        cell.alignment = Alignment(horizontal="center", vertical="center")
    for result in combined_results:
        ws.append(result)
    ws.column_dimensions['B'].width = 60
    for col in ['A', 'C', 'D', 'E', 'F', 'G']:
        ws.column_dimensions[col].width = 30
    for col in ws.columns:
        for cell in col:
            cell.alignment = Alignment(wrap_text=True, vertical="center")
    wb.save(comparison_filename)
    messagebox.showinfo("Comparison Complete", f"Comparison report saved as {comparison_filename}")
# Custom dialog for selecting source and target files
def custom_file_dialog():
    dialog = tk.Toplevel(root)
    dialog.title("Select Source and Target Files")
    dialog.geometry("600x250")
    dialog.resizable(False, False)
    dialog.update_idletasks()
    width = dialog.winfo_width()
    height = dialog.winfo_height()
    x = (dialog.winfo_screenwidth() // 2) - (width // 2)
    y = (dialog.winfo_screenheight() // 2) - (height // 2)
    dialog.geometry(f"{width}x{height}+{x}+{y}")
    label = ttk.Label(dialog, text="Select Source and Target Files", font=("Helvetica", 14, "bold"))
    label.grid(row=0, column=0, columnspan=3, pady=10)
    file1_path_var = tk.StringVar()
    file2_path_var = tk.StringVar()

    def open_file1_dialog():
        file1_path = filedialog.askopenfilename(title="Select Source File", filetypes=[("Supported files", "*.pdf *.docx *.pptx")])
        file1_path_var.set(file1_path)

    def open_file2_dialog():
        file2_path = filedialog.askopenfilename(title="Select Target File", filetypes=[("Supported files", "*.pdf *.docx *.pptx")])
        file2_path_var.set(file2_path)

    file1_label = ttk.Label(dialog, text="Source File:", font=("Helvetica", 12))
    file1_label.grid(row=1, column=0, padx=10, pady=5, sticky=tk.E)
    file1_entry = ttk.Entry(dialog, textvariable=file1_path_var, width=50)
    file1_entry.grid(row=1, column=1, padx=10, pady=5)
    file1_button = ttk.Button(dialog, text="Browse", command=open_file1_dialog)
    file1_button.grid(row=1, column=2, padx=10, pady=5)
    file2_label = ttk.Label(dialog, text="Target File:", font=("Helvetica", 12))
    file2_label.grid(row=2, column=0, padx=10, pady=5, sticky=tk.E)
    file2_entry = ttk.Entry(dialog, textvariable=file2_path_var, width=50)
    file2_entry.grid(row=2, column=1, padx=10, pady=5)
    file2_button = ttk.Button(dialog, text="Browse", command=open_file2_dialog)
    file2_button.grid(row=2, column=2, padx=10, pady=5)

    def on_generate_report():
        dialog.destroy()

    generate_button = ttk.Button(dialog, text="Generate Report", command=on_generate_report, style="Generate.TButton")
    generate_button.grid(row=3, column=0, columnspan=3, pady=10)
    dialog.transient(root)
    dialog.grab_set()
    root.wait_window(dialog)
    return file1_path_var.get(), file2_path_var.get()
# Function to select files for comparison
def select_files_for_comparison():
    file1_path, file2_path = custom_file_dialog()
    if not file1_path or not file2_path:
        return
    compare_files(file1_path, file2_path)
# Function to select files for URL extraction
def select_files():
    file_paths = filedialog.askopenfilenames(title="Select Files", filetypes=[("Supported files", "*.pdf *.docx *.pptx")])
    process_files(file_paths)
# Function to highlight URLs in selected PDF files
def highlight_pdfs():
    file_paths = filedialog.askopenfilenames(title="Select PDF Files for Highlighting", filetypes=[("PDF files", "*.pdf")])
    if not file_paths:
        return
    for file_path in file_paths:
        urls = extract_urls(file_path)
        results = check_urls(urls)
        highlight_and_comment_urls_in_pdf(file_path, results)
    messagebox.showinfo("Highlighting Complete", "Highlighting of URLs in PDFs completed.")
# Setup GUI
root = tk.Tk()
root.title("URL Extractor and Comparator")
root.geometry("400x300")
root.resizable(False, False)
root.update_idletasks()
width = root.winfo_width()
height = root.winfo_height()
x = (root.winfo_screenwidth() // 2) - (width // 2)
y = (root.winfo_screenheight() // 2) - (height // 2)
root.geometry(f"{width}x{height}+{x}+{y}")
style = ttk.Style()
style.configure("TButton", font=("Helvetica", 12))
style.configure("Accent.TButton", foreground="black", background="#4CAF50", font=("Helvetica", 12, "bold"))
style.map("Accent.TButton", background=[("active", "#45a049"), ("disabled", "#A9A9A9")])
style.configure("Generate.TButton", foreground="black", background="#FF5733", font=("Helvetica", 12, "bold"))
style.map("Generate.TButton", background=[("active", "#E64A19"), ("disabled", "#A9A9A9")])
frame = ttk.Frame(root, padding="20")
frame.grid(row=0, column=0, sticky="nsew")
root.grid_rowconfigure(0, weight=1)
root.grid_columnconfigure(0, weight=1)
# Center the content in the frame
frame.grid_rowconfigure(0, weight=1)
frame.grid_rowconfigure(1, weight=1)
frame.grid_rowconfigure(2, weight=1)
frame.grid_rowconfigure(3, weight=1)
frame.grid_columnconfigure(0, weight=1)
label = ttk.Label(frame, text="URL Extractor and Comparator", font=("Arial", 16))
label.grid(row=0, column=0, pady=5, sticky="n")
extract_button = ttk.Button(frame, text="Select Files for URL Extraction", command=select_files, style="Accent.TButton")
extract_button.grid(row=1, column=0, pady=5)
compare_button = ttk.Button(frame, text="Select Files for Comparison", command=select_files_for_comparison, style="Accent.TButton")
compare_button.grid(row=2, column=0, pady=5)
highlight_button = ttk.Button(frame, text="Highlight URLs in PDFs", command=highlight_pdfs, style="Accent.TButton")
highlight_button.grid(row=3, column=0, pady=5)
root.mainloop()