Hi! So I have been intrigued to code something that scrapes a Clash Royale website to pull information based on different clans within the game and then compile it within an excel spreadsheet. I am looking to up the stakes and pull more information from each player, within the aforementioned clans, and display their war statistics.
More or less what I have at the moment is a script that scrapes each clan, and outputs an excel spreadsheet with every member from each clan and also categorises it accordingly.
My problem lies in gathering their war statistics, since this requires Selenium to load JavaScript on the website (I AM VERY NEW TO PYTHON BTW, SO EVEN GETTING THIS TO WORK WAS A HEADACHE LOL). Fortunately, I seem to be not far off the main goal; however, I am truly unsure how to get this all to work together properly.
So what I have got at the moment, 2 scripts:
- The first script, let's call it “clan_data_scraper.py”, scrapes the clan data, displaying all members within each clan.
- The second script, let's call it “player_war_scraper.py”, asks for a player_tag input on the command line and displays that player's war statistics.
The player_tag is a unique identifier within Clash Royale, as it's assigned to each individual's account.
WHAT I WANT TO DO
How would I go about linking the 2 scripts together? When I run the clan_data_scraper.py script, I would like it to generate the information I already have AS WELL AS output 7 additional Excel files, labelled “Aftermath”, “Aftershock”, “Afterlife”, “Afterglow”, “Afterparty”, “Afterburn” and “Aftertaste” — i.e. all of the clan names. Taking “Aftermath” as an example: within this file, I would like the player_war_scraper.py logic to have also been run from within clan_data_scraper.py, so that each member's war statistics appear on their own sheet, with each sheet labelled with their Name and player_tag. I understand the “Aftermath” file could have up to 50 sheets, since that is the maximum number of players a clan can have; however, I would like the number of sheets to match exactly how many players are currently in the clan.
Hopefully this makes sense, below are the 2 scripts I have created:
1st Script: clan_data_scraper.py
import re
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.drawing.image import Image as xlImage # Rename Image to xlImage
from datetime import datetime
# List of clan URLs with corresponding names.
# Each entry is (statsroyale clan page URL, sheet/clan name).
clan_urls = [
    ('https://statsroyale.com/clan/8CYR2V', 'Aftermath'),
    ('https://statsroyale.com/clan/YC8R0RJ0', 'Aftershock'),
    ('https://statsroyale.com/clan/L2RRRGVG', 'Afterlife'),
    ('https://statsroyale.com/clan/LUYLLQJR', 'Afterglow'),
    ('https://statsroyale.com/clan/Q0JRGC22', 'Afterparty'),
    ('https://statsroyale.com/clan/G22P0PUG', 'Aftertaste'),
    ('https://statsroyale.com/clan/QY920UQR', 'Afterburn'),
    # Add more clan URLs as needed
]
# Define column widths (Excel column letter -> width in characters).
# Column F is intentionally skipped: it is the gap between the player
# table (A-E) and the clan-info box (G-H).
column_widths = {'A': 13, 'B': 25, 'C': 5, 'D': 8, 'E': 20, 'G': 13, 'H': 20}
# Create a new workbook; one sheet per clan is added in main().
wb = Workbook()
# Load the image
# NOTE(review): img_path (and the xlImage import above) is never used
# anywhere in this script — confirm whether the button image feature was
# abandoned or is still planned.
img_path = "C:\\Users\\mmoor\\Desktop\\Python Script\\button.png"
async def fetch_clan_data(session, url, clan_name):
    """Fetch one clan page and return ``(clan_name, html_text)``.

    Args:
        session: an open ``aiohttp.ClientSession``.
        url: the statsroyale clan page URL.
        clan_name: label carried through so results can be matched to clans.

    Raises:
        aiohttp.ClientResponseError: on a non-2xx response. The original
        code skipped this check, so a 404/500 error page would have been
        silently parsed as clan data downstream.
    """
    async with session.get(url) as response:
        response.raise_for_status()
        return clan_name, await response.text()
async def fetch_all_clan_data():
    """Download every configured clan page concurrently.

    Returns a list of ``(clan_name, html_text)`` tuples, one per entry
    in the module-level ``clan_urls`` list, in the same order.
    """
    async with aiohttp.ClientSession() as session:
        pending = []
        for url, clan_name in clan_urls:
            pending.append(fetch_clan_data(session, url, clan_name))
        return await asyncio.gather(*pending)
async def main():
    """Fetch every clan page concurrently, then build one styled worksheet per clan."""
    clan_data = await fetch_all_clan_data()
    for clan_name, content in clan_data:
        players_data = _parse_players(content)
        ws = wb.create_sheet(title=clan_name)
        _write_sheet(ws, clan_name, players_data)


def _parse_players(content):
    """Parse a clan page's HTML into player rows.

    Returns a list of ``[player_tag, name, level, trophies, player_page_url]``
    lists, one per member row found on the page.
    """
    soup = BeautifulSoup(content, 'lxml')
    players_data = []
    for row_div in soup.find_all('div', class_='clan__rowContainer'):
        # Inner divs 1 and 2 of each row hold the player's name and level
        # (div 0 is the rank marker).
        player_data = [div.get_text(strip=True)
                       for index, div in enumerate(row_div.find_all('div'))
                       if index in (1, 2)]
        # Trophies live in their own div; fall back to '' when absent.
        trophies = row_div.find('div', class_='clan__cup')
        player_data.append(trophies.get_text(strip=True) if trophies else '')
        # The player tag is the trailing path segment of the profile link.
        profile_link = row_div.find('a', class_='ui__blueLink')['href']
        player_tag = re.findall(r'[\w]+$', profile_link)[0]
        player_data.insert(0, player_tag)
        # Construct the royaleapi.com player page link.
        player_data.append(f'https://royaleapi.com/player/{player_tag}')
        # Replace emojis / punctuation in the display name with spaces.
        player_data[1] = re.sub(r'[^\w\s]', ' ', player_data[1])
        players_data.append(player_data)
    return players_data


def _write_sheet(ws, clan_name, players_data):
    """Write header + player rows to *ws* and apply all cell styling.

    Mutates each row in *players_data* in place (the URL in the last
    column is replaced by an Excel HYPERLINK formula), matching the
    original script's behavior.
    """
    headers = ['Player Tag', 'Name', 'Level', 'Trophies', 'Player Page']
    ws.append(headers)
    for player in players_data:
        # Turn the stored URL into a clickable Excel HYPERLINK formula.
        player[-1] = f'=HYPERLINK("{player[-1]}", "View on RoyaleAPI")'
        ws.append(player)

    # Header row: small, centered font.
    for cell in ws[1]:
        cell.font = Font(size=9)
        cell.alignment = Alignment(horizontal='center', vertical='center')

    # Hyperlink styling for the "View on RoyaleAPI" text in column E.
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=5, max_col=5):
        for cell in row:
            cell.font = Font(color='31869B', underline='single')
    ws['E1'].alignment = Alignment(horizontal='center', vertical='center')

    # Column widths and alignment.
    for col, width in column_widths.items():
        ws.column_dimensions[col].width = width
    for cell in ws['E']:
        cell.alignment = Alignment(horizontal='center', vertical='center')
    for col in ('C', 'D'):
        for cell in ws[col]:
            cell.alignment = Alignment(horizontal='center', vertical='center')

    # Alternating white / light-gray row banding on the player table.
    for i, row in enumerate(ws.iter_rows(min_row=2, max_row=ws.max_row,
                                         min_col=1, max_col=5), start=2):
        fill_color = 'FFFFFF' if i % 2 == 0 else 'D9D9D9'
        for cell in row:
            cell.fill = PatternFill(start_color=fill_color,
                                    end_color=fill_color, fill_type='solid')

    # Thin #BFBFBF borders on every populated cell.
    border = Border(left=Side(border_style='thin', color='BFBFBF'),
                    right=Side(border_style='thin', color='BFBFBF'),
                    top=Side(border_style='thin', color='BFBFBF'),
                    bottom=Side(border_style='thin', color='BFBFBF'))
    for row in ws.iter_rows():
        for cell in row:
            cell.border = border

    # Clan summary box in G2:H4 (name, member count, timestamp).
    last_updated = datetime.now().strftime("%d/%m/%Y | %H:%M:%S")
    ws['G2'] = 'Clan: '
    ws['H2'] = clan_name
    ws['G3'] = 'Members: '
    ws['H3'] = f'{len(players_data)}'
    ws['G4'] = 'Last Updated: '
    ws['H4'] = last_updated

    # Font, alternating fills, and borders for the summary box.
    info_fills = {'G2': 'FFFFFF', 'H2': 'FFFFFF',
                  'G3': 'D9D9D9', 'H3': 'D9D9D9',
                  'G4': 'FFFFFF', 'H4': 'FFFFFF'}
    for ref, fill in info_fills.items():
        ws[ref].font = Font(size=11)
        ws[ref].fill = PatternFill(start_color=fill, end_color=fill,
                                   fill_type='solid')
        ws[ref].border = border

    # Labels (column G) right-aligned, values (column H) left-aligned.
    for row in ws.iter_rows(min_row=2, max_row=4, min_col=7, max_col=8):
        for cell in row:
            side = 'right' if cell.column == 7 else 'left'
            cell.alignment = Alignment(horizontal=side, vertical='center')
# Guarding the entry point lets other scripts import the fetch/parse
# helpers above without kicking off a full scrape on import.
if __name__ == '__main__':
    # Run the asyncio event loop; sheets are added to `wb` as a side effect.
    asyncio.run(main())
    # Remove the default sheet openpyxl creates with a new Workbook
    # (guarded so a rerun against a modified workbook cannot KeyError).
    if 'Sheet' in wb.sheetnames:
        wb.remove(wb['Sheet'])
    # Save the workbook
    wb.save('clan_data.xlsx')
    print("Excel file generated successfully.")
2nd Script: player_war_scraper.py
import sys
import time
import io
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Placeholder value for the number of rows of information to generate
# (i.e. how many Clan Wars 2 history rows to keep per player).
num_rows = 12
def scrape_clan_wars_info(player_tag, max_rows=None):
    """Scrape a player's Clan Wars 2 history from royaleapi.com.

    Loads the player's RoyaleAPI page in headless Chrome (the history
    table is rendered by JavaScript, hence Selenium), clicks the history
    button, and exports the first *max_rows* rows of the selected columns
    to ``Clan_Wars_2_History_Selected_<player_tag>.xlsx``.

    Args:
        player_tag: Clash Royale player tag (without the leading '#').
        max_rows: number of history rows to keep; defaults to the
            module-level ``num_rows`` when None (backward compatible).

    Returns:
        The pandas DataFrame that was written, so callers (e.g. a clan
        scraper driving this per member) can reuse the data directly
        instead of re-reading the Excel file. (Previously returned None.)
    """
    # selenium is already a dependency of this script (see module imports).
    from selenium.common.exceptions import WebDriverException

    if max_rows is None:
        max_rows = num_rows

    # Construct the URL with the player tag variable.
    url = f"https://royaleapi.com/player/{player_tag}"

    # Configure Chrome for headless operation with images disabled
    # (faster page loads; we only need the DOM, not the rendering).
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run Chrome in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
    chrome_options.add_argument("--disable-infobars")  # Disable info bars
    chrome_options.add_argument("--disable-dev-shm-usage")  # Disable /dev/shm usage
    chrome_options.add_argument("--no-sandbox")  # Disable sandbox mode
    chrome_options.add_argument("--disable-extensions")  # Disable extensions
    chrome_options.add_argument("--disable-browser-side-navigation")
    chrome_options.add_argument("--disable-features=VizDisplayCompositor")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})

    # Initialize Chrome WebDriver with configured options.
    driver = webdriver.Chrome(options=chrome_options)
    driver.implicitly_wait(10)
    driver.get(url)
    try:
        # Best-effort dismissal of the cookie-consent dialog. The iframe id
        # is generated by the consent provider and may change or be absent
        # (e.g. consent already stored), so a failure here must not abort
        # the whole scrape — previously it did.
        try:
            driver.switch_to.frame("sp_message_iframe_1104950")
            iframe_document = driver.find_element(By.TAG_NAME, "html")
            driver.switch_to.frame(iframe_document)
            accept_button = driver.find_element(
                By.XPATH, "//button[contains(text(), 'Accept')]")
            accept_button.click()
        except WebDriverException:
            pass  # dialog not present / already dismissed — continue anyway
        finally:
            # Always return to the top-level document.
            driver.switch_to.default_content()

        # The history table is only inserted after this button is pressed;
        # clicking via JavaScript avoids overlay/interception issues.
        load_button = driver.find_element(By.CLASS_NAME, "cw2_history_button")
        driver.execute_script("arguments[0].click();", load_button)

        # Wait for the JS-rendered table to appear.
        table = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, "player__cw2_history_table")))

        # Column names are stored in the headers' data-content attributes.
        headers = [th.get_attribute("data-content")
                   for th in table.find_elements(By.TAG_NAME, "th")]
        selected_columns = ['Season ID', 'Date', 'Clan Name', 'Decks Used', 'Fame']

        # Parse the table HTML into a DataFrame and trim to the wanted
        # columns and row count.
        table_html = table.get_attribute('outerHTML')
        df = pd.read_html(table_html)[0]
        df.columns = headers
        df_selected_rows = df[selected_columns].head(max_rows)

        # Export the DataFrame to an Excel file.
        file_name = f"Clan_Wars_2_History_Selected_{player_tag}.xlsx"
        df_selected_rows.to_excel(file_name, index=False)
        print(f"Selected {max_rows} rows of Clan Wars 2 history with specified columns successfully exported to {file_name}")
        return df_selected_rows
    finally:
        # Always release the browser, even on scrape failure.
        driver.quit()
# Guarding the entry point lets clan_data_scraper.py (or any other
# script) import scrape_clan_wars_info without triggering a scrape or
# a sys.exit on import — previously this ran at module level.
if __name__ == "__main__":
    # Check if player tag is provided as command-line argument.
    if len(sys.argv) != 2:
        print("Please provide the player tag:")
        sys.exit(1)
    player_tag = sys.argv[1]
    scrape_clan_wars_info(player_tag)