Hey Chris! Thanks for your answer.
I've been working on my code some more; there were many things that needed improvement. This is the code as of right now:
import csv
import requests
import scrapy
import ssl
import re
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor, defer
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
# Disable SSL certificate verification
ssl._create_default_https_context = ssl._create_unverified_context
def is_scraping_allowed(url):
    # Check if scraping is allowed for the given URL via robots.txt
    robots_url = f"{url}/robots.txt"
    response = requests.get(robots_url)
    if response.status_code == 200:
        robots_txt = response.text.lower()
        if "user-agent: *" in robots_txt and "disallow: /" in robots_txt:
            return False
    # Also scan the German legal pages for sentences that forbid crawling/scraping
    response = requests.get(url, timeout=5)
    if response.status_code == 200:
        text = response.text.lower()
        relevant_pages = ["agb", "impressum", "datenschutzerklärung"]  # terms, imprint, privacy policy
        first_group_phrases = ["crawlen", "scrapen"]  # "crawl", "scrape"
        second_group_phrases = ["verboten", "nicht gestattet", "nicht erlaubt"]  # "forbidden", "not permitted", "not allowed"
        for page in relevant_pages:
            if page in text:
                sentences = re.split(r"[.!?]", text[text.index(page):])
                for sentence in sentences:
                    if any(phrase in sentence for phrase in first_group_phrases) and any(
                            phrase in sentence for phrase in second_group_phrases):
                        return False
    return True
class PageChecker(CrawlSpider):
    name = 'WebsiteAnalysis'
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS': 4,
        'USER_AGENT': 'Mozilla/5.0 (compatible; WebsiteAnalysis/1.0; +http://www.webdesignowl.com)',
        'LOG_LEVEL': 'ERROR'
    }

    def __init__(self, websites=None, results=None, **kwargs):
        self.start_urls = websites
        self.allowed_domains = [self.extract_domain(url) for url in websites]
        self.rules = [Rule(LinkExtractor(allow=()), callback=self.parse_page, follow=True)]
        self.results = results if results is not None else {}  # Track results for each URL
        self.pages_found = []  # Track all pages found during crawl
        self.robot_parser = RobotFileParser()  # RobotFileParser object
        super().__init__(**kwargs)

    @staticmethod
    def extract_domain(url):
        return urlparse(url).netloc

    def parse_page(self, response):
        status_code = response.status
        if status_code in [404, 410]:
            self.results.setdefault(response.url, True)
            self.pages_found.append((response.url, status_code))

    def closed(self, reason):
        save_to_csv(self.results, self.pages_found)

    def process_request(self, request):
        url = request.url
        domain = self.extract_domain(url)
        self.robot_parser.set_url(f"https://{domain}/robots.txt")
        self.robot_parser.read()
        if not self.robot_parser.can_fetch("*", url):
            self.crawler.stats.inc_value("request/filtered")
            self.results.setdefault(url, "Scraping not allowed")
            raise scrapy.exceptions.IgnoreRequest(f"Scraping not allowed by robots.txt: {url}")
def check_inactive_pages(websites):
    results = {}
    pages_found = []
    runner = CrawlerRunner(get_project_settings())
    if websites:
        crawl_deferred_list = []
        for website in websites:
            scraping_allowed = is_scraping_allowed(website)  # Check if scraping is allowed for the website
            if scraping_allowed is False:
                results.setdefault(website, "Scraping not allowed")
            elif isinstance(scraping_allowed, int):
                results[website] = {'pages_found': []}  # Track results for the current website
                d = runner.crawl(PageChecker, websites=[website], results=results[website])
                crawl_deferred_list.append(d)
        # Call save_to_csv after all the crawls are complete
        defer.DeferredList(crawl_deferred_list).addBoth(lambda _: save_to_csv(results, pages_found))
    else:
        print("No websites to process.")
        return
    # Run the reactor to execute the crawls
    reactor.run()
def save_to_csv(results, pages_found):
    if results or pages_found:
        csv_file_path = 'inactive_pages.csv'
        with open(csv_file_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['URL', 'Status'])
            for url, status in results.items():
                if status is True:
                    writer.writerow([url, '404 pages found on website'])
                elif status == "Scraping not allowed":
                    writer.writerow([url, 'Scraping not allowed'])
                elif isinstance(status, int):
                    writer.writerow([url, 'No 404 pages found on website'])
                else:
                    writer.writerow([url, 'Website can\'t be reached'])
            if pages_found:
                writer.writerow([])  # Blank row to separate the two tables
                writer.writerow(['Page', 'Status Code'])
                for page, status_code in pages_found:
                    writer.writerow([page, status_code])
        print("CSV file created successfully.")
websites = [
    "https://webdesignowl.com",
    "https://praxismuellerschulz.de",
    "https://korfugrilldetmold.de",
    "https://tierarztpraxis-idw-naurath.de"
]
# Check for inactive pages (404 or 410) on the websites and save the results to a CSV file
check_inactive_pages(websites)
What I want is this:
Software where I enter websites.
The software checks each website to see whether crawling is forbidden or not.
If allowed, it crawls every page on the website, looking for 404/410 pages that were once present on the website (i.e. previously status code 200).
It creates a CSV with two tables, both with two columns. In table 1, the left column lists all the websites that were entered, and the right column says either "404 pages found on website", "No 404 pages found on website", "Scraping not allowed", or "Website can't be reached". Remember, the entire websites are supposed to be crawled for 404 pages. In table 2, the left column lists all the pages that were found during the crawl, and the right column shows their status codes. This second table exists so I can verify whether the program did or didn't find 404 pages; see the example right below.
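Roughly like this (the URL/status pairings and the second-table row are just example values, and the blank line separates the two tables):

URL,Status
https://webdesignowl.com,404 pages found on website
https://praxismuellerschulz.de,No 404 pages found on website
https://korfugrilldetmold.de,Scraping not allowed

Page,Status Code
https://webdesignowl.com/some-old-page,404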
And what is happening right now is this:
When the code runs, it creates one file with all the pages it finds for the first website I enter. Then, when it is done crawling that website, it creates another file with the same name in the same directory, overwriting the old one, containing all the pages of the second website I entered, and then the same for the third. When it is done, it creates one last file where all the websites that were entered are shown in the left column, but for some reason the right column says "Website can't be reached" for every one of them, even though, as I just said, all the pages of those websites were found in the files created before. So the first files are essentially just the second table, each for a single website, and the last file is only the first table, but with the wrong statuses.
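One thing I noticed while rereading my own code: each spider's closed() callback calls save_to_csv(), and save_to_csv() always opens inactive_pages.csv with mode 'w', so I suspect every per-website spider truncates and rewrites the same file when it finishes, and the final DeferredList callback then writes it one last time. A standalone sketch of the truncating behavior I mean (demo.csv and the row values are made up for illustration):

import csv

# Two writers targeting the same path, both opening with mode 'w':
# each open() truncates the file, so only the last write survives.
for batch in (["pages-of-site-1"], ["pages-of-site-2"]):
    with open("demo.csv", "w", newline="") as f:
        csv.writer(f).writerows([[row] for row in batch])

# demo.csv now contains only "pages-of-site-2"

What I still don't understand is why the right column of that last file says "Website can't be reached" even though the crawls clearly succeeded.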
Thanks in advance for your help!