Hi guys, this is my first post, so I hope this is okay.
I have been working on a piece of code that locates company URLs/domains and directors' names from information sourced from Companies House and the Land Registry for my work. The directors' name section works fine, but the URL-finding part is not great. I am not very good at coding and wondered if anyone could help me improve it. The input Excel file includes: title number, postcode, address (not always correct), company name, company registration number, date of purchase for the property, SIC code, and price paid for the property. This is what I have at the moment to find them — it does work, it is just not as good as I would like. If anyone can suggest anything to make this better, it would be much appreciated. I have replaced some of the personal info with XXX:
import base64
import re
from datetime import datetime
from urllib.parse import quote_plus, urlparse

import pandas as pd
import requests
import scrapy
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
class EnhancedDomainSpider(scrapy.Spider):
    """Spider that resolves company websites and director names.

    Reads company records from an Excel sheet, asks the Companies House
    API for an official website first, falls back to a Google Custom
    Search lookup, and enriches each record with the (oldest) active
    director's name plus a fuzzy SIC-code/website match status.
    """

    name = 'enhanced_domain_spider'

    # Path to the Excel file with company details.
    excel_path = r"C:\Users\XXX\Desktop\Spider\sample data.xlsx"

    # Google Custom Search API key and search-engine id (cx).
    API_KEY = "XXX"
    CX = "XXX"

    # Companies House API key.
    COMPANIES_HOUSE_API_KEY = "XXX"

    # Read the Excel file into a pandas DataFrame once, at class level.
    df = pd.read_excel(excel_path)

    # Cache of company_number -> (first_name, last_name) so each
    # company's officers endpoint is fetched at most once.
    director_cache = {}

    def _auth_headers(self):
        """Build the HTTP Basic auth header for the Companies House API.

        Companies House expects the API key as the *username* with an
        empty password, i.e. base64("<key>:") — the trailing colon is
        required; encoding the bare key is rejected.
        """
        token = base64.b64encode(f"{self.COMPANIES_HOUSE_API_KEY}:".encode()).decode("utf-8")
        return {"Authorization": f"Basic {token}"}

    @staticmethod
    def _normalise_name(company_name):
        """Lower-case a company name, drop a trailing legal suffix and all
        punctuation/whitespace, so it can be compared against a domain
        (e.g. 'ACME Trading Limited' -> 'acmetrading')."""
        lowered = company_name.lower().strip()
        for suffix in (' limited', ' ltd', ' ltd.', ' plc', ' llp'):
            if lowered.endswith(suffix):
                lowered = lowered[: -len(suffix)]
                break
        return re.sub(r'[^a-z0-9]', '', lowered)

    def start_requests(self):
        """Yield one Companies House profile request per unique company."""
        # Remove duplicates based on the "Company Registration Number" column.
        unique_companies_df = self.df.drop_duplicates(subset='Company Registration Number')
        for _, row in unique_companies_df.iterrows():
            company_name = row['Proprietor Name']
            try:
                # Registration numbers are zero-padded to eight digits.
                # TypeError covers None/NaN cells as well as bad strings.
                company_number = str(int(row['Company Registration Number'])).zfill(8)
            except (ValueError, TypeError):
                self.logger.warning(f"Invalid company number for {company_name}, skipping.")
                continue
            sic_code = row.get('Sic Code', '')
            # First try the Companies House API to get the URL.
            # NOTE: the URL must be plain text — a markdown-style
            # "[url](url)" wrapper would be sent to a garbage address.
            companies_house_url = f"https://api.company-information.service.gov.uk/company/{company_number}"
            yield scrapy.Request(
                url=companies_house_url,
                headers=self._auth_headers(),
                callback=self.parse_companies_house,
                # handle_httpstatus_list lets 404s reach the callback —
                # by default Scrapy drops non-2xx responses before the
                # callback, so the 404 check would never run.
                meta={'company_name': company_name,
                      'company_number': company_number,
                      'sic_code': sic_code,
                      'row': row,
                      'handle_httpstatus_list': [404]},
            )

    def parse_companies_house(self, response):
        """Use the Companies House profile website if present; otherwise
        fall back to a Google Custom Search for the company name."""
        company_name = response.meta['company_name']
        company_number = response.meta['company_number']
        sic_code = response.meta['sic_code']
        row = response.meta['row']

        company_url = None
        if response.status == 404:
            # Unknown company number: don't drop the row — we can still
            # try Google below.
            self.logger.warning(f"Company number {company_number} not found on Companies House.")
        else:
            try:
                data = response.json()
            except Exception as e:
                self.logger.error(f"Failed to parse JSON response for {company_name} ({company_number}): {e}")
                return
            company_url = data.get("website", None)

        if company_url and "company-information.service.gov.uk" not in company_url:
            first_name, last_name = self.fetch_director_details(company_number)
            yield {
                'Title Number': row['Title Number'],
                'Postcode': row['Postcode'],
                'Property Address': row['Property Address'],
                'Price Paid': row['Price Paid'],
                'Est. Relief': row['Est. Relief'],
                'Proprietor Name': company_name,
                'Company Registration Number': company_number,
                'Date Proprietor Added': row['Date Proprietor Added'],
                'Sic Code': sic_code,
                'company_url': self.clean_domain(company_url),
                'director_first_name': first_name,
                'director_last_name': last_name,
                'match_status': self.validate_sic_code_match(company_url, sic_code),
            }
        else:
            # Fall back to Google search; quote_plus keeps spaces and
            # '&' in the company name from corrupting the query string.
            search_query = company_name
            search_url = (f"https://www.googleapis.com/customsearch/v1"
                          f"?q={quote_plus(search_query)}&key={self.API_KEY}&cx={self.CX}")
            yield scrapy.Request(url=search_url, callback=self.parse_google, meta={
                'company_name': company_name,
                'company_number': company_number,
                'sic_code': sic_code,
                'search_query': search_query,
                'row': row,
            })

    def parse_google(self, response):
        """Score the Google results and keep the best-matching domain."""
        company_name = response.meta['company_name']
        company_number = response.meta['company_number']
        sic_code = response.meta['sic_code']
        row = response.meta['row']
        try:
            data = response.json()
        except Exception as e:
            self.logger.error(f"Failed to parse JSON response from Google for {company_name} ({company_number}): {e}")
            return

        company_url = None
        best_match_score = 0
        # Compare against a punctuation-free form of the name so that
        # "ACME Trading Limited" can match "acmetrading.co.uk" — the raw
        # name (with spaces) almost never appears inside a domain.
        normalised_name = self._normalise_name(company_name)
        for item in data.get('items', []):
            url = item.get('link', '')
            # Skip irrelevant links like PDFs or social/directory sites.
            if self.is_irrelevant_website(url):
                continue
            # Extract the domain and match with the company name.
            extracted_domain = self.extract_domain(url)
            match_score = self.evaluate_url(company_name, extracted_domain, sic_code)
            domain_compact = re.sub(r'[^a-z0-9]', '', extracted_domain.lower())
            if normalised_name and normalised_name in domain_compact:
                match_score += 50  # Strong indicator that it is a match.
            # Keep the highest-scoring candidate.
            if match_score > best_match_score:
                company_url = extracted_domain
                best_match_score = match_score

        if not company_url:
            company_url = 'No direct website found'
        first_name, last_name = self.fetch_director_details(company_number)
        yield {
            'Title Number': row['Title Number'],
            'Postcode': row['Postcode'],
            'Property Address': row['Property Address'],
            'Price Paid': row['Price Paid'],
            'Est. Relief': row['Est. Relief'],
            'Proprietor Name': company_name,
            'Company Registration Number': company_number,
            'Date Proprietor Added': row['Date Proprietor Added'],
            'Sic Code': sic_code,
            'company_url': company_url,
            'director_first_name': first_name,
            'director_last_name': last_name,
            'match_status': self.validate_sic_code_match(company_url, sic_code),
        }

    def fetch_director_details(self, company_number):
        """Return (first_name, last_name) of the chosen director, cached
        per company number. Returns ("", "") on any API failure."""
        if company_number in self.director_cache:
            return self.director_cache[company_number]
        # Plain URL, not a markdown link — see start_requests.
        url = f"https://api.company-information.service.gov.uk/company/{company_number}/officers"
        try:
            # timeout prevents a dead endpoint from stalling the crawl.
            response = requests.get(url, headers=self._auth_headers(), timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to fetch director details for company {company_number}: {e}")
            return "", ""
        first_name, last_name = self.extract_director_name(response.json())
        self.director_cache[company_number] = (first_name, last_name)
        return first_name, last_name

    def extract_director_name(self, data):
        """Pick the oldest active director and split their name.

        Companies House returns officer names as "SURNAME, Forenames".
        Directors with no recorded birth year are skipped — a missing
        year must not default to 0, which would fabricate a huge "age"
        and always win the oldest-first sort.
        """
        current_year = datetime.now().year
        directors = []
        for officer in data.get('items', []):
            if officer.get('officer_role') == 'director' and not officer.get('resigned_on'):
                birth_year = (officer.get('date_of_birth') or {}).get('year')
                if birth_year:
                    directors.append((officer['name'], current_year - birth_year))
        if not directors:
            return "", ""
        # Oldest director first (assumed most senior).
        directors.sort(key=lambda d: d[1], reverse=True)
        full_name = directors[0][0]
        name_parts = full_name.split(", ")
        # Strip a "Dr" *word* only — a bare .replace("Dr", "") would
        # mangle surnames such as "Drake" into "ake".
        last_name = re.sub(r'\bDr\b\.?', '', name_parts[0]).strip()
        first_name = name_parts[1].split()[0].replace(",", "") if len(name_parts) > 1 else ''
        return first_name, last_name

    def is_irrelevant_website(self, url):
        """True for social/directory domains, helper pages and binary
        file links that cannot be a company's own website."""
        irrelevant_domains = [
            'wikipedia.org', 'youtube.com', 'facebook.com', 'support.google.com', 'twitter.com', 'linkedin.com',
            'lusha.com', 'pomanda.com', 'yell.com', 'indeed.com', 'imdb.com', '.shop',
        ]
        excluded_keywords = ['support', 'careers', 'help', 'contact', 'faq']
        excluded_extensions = ['.pdf', '.doc', '.xls', '.mp4', '.zip']
        lowered = url.lower()
        # Check for irrelevant domains.
        if any(domain in lowered for domain in irrelevant_domains):
            return True
        # Check for irrelevant keywords.
        if any(keyword in lowered for keyword in excluded_keywords):
            return True
        # Check for irrelevant file extensions.
        if any(lowered.endswith(ext) for ext in excluded_extensions):
            return True
        return False

    def evaluate_url(self, company_name, url, sic_code):
        """Heuristic 0-100 score for how likely *url* is the company's
        site. *sic_code* is accepted for interface compatibility but is
        not currently used in the score."""
        score = 0
        if self.company_name_matches_url(company_name, url):
            score += 70
        if self.is_valid_website(url):
            score += 30
        return score

    def is_valid_website(self, url):
        """True if the URL carries a common commercial TLD."""
        domain_patterns = [".com", ".co.uk", ".org", ".net"]
        return any(pattern in url.lower() for pattern in domain_patterns)

    def company_name_matches_url(self, company_name, url):
        """True if any word of the company name appears in the domain."""
        parsed_domain = self.extract_domain(url)
        company_keywords = company_name.lower().split()
        return any(keyword in parsed_domain for keyword in company_keywords)

    def extract_domain(self, url):
        """Return the host of *url* with a leading 'www.' prefix removed.

        str.lstrip("www.") must NOT be used here: it strips *characters*
        ('w', '.'), so 'web.example.com' would become 'eb.example.com'.
        """
        domain = urlparse(url).netloc
        if domain.startswith("www."):
            domain = domain[len("www."):]
        return domain

    def clean_domain(self, url):
        """Extract the bare domain, discarding any trailing path part."""
        domain = self.extract_domain(url)
        return domain.split('/')[0]

    def validate_sic_code_match(self, company_url, sic_code):
        """Fuzzy-check whether the website text mentions the SIC
        description. Returns 'Reliable Match', 'Likely Match' or
        'No Match Found'."""
        # SIC codes may arrive from Excel as plain ints; they are only
        # usable here in "<code> - <description>" form.
        sic_code = str(sic_code)
        if company_url and company_url != 'No direct website found' and ' - ' in sic_code:
            try:
                response = requests.get(f"http://{company_url}", timeout=5)
                # Limit response size to prevent excessive downloads.
                if response.status_code == 200 and len(response.content) < 5 * 1024 * 1024:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    text = soup.get_text().lower()
                    sic_description = sic_code.split(' - ')[1].lower()
                    if fuzz.partial_ratio(sic_description, text) > 60:
                        return 'Reliable Match'
            except requests.exceptions.RequestException:
                pass
        return 'Likely Match' if company_url != 'No direct website found' else 'No Match Found'