Repeating set of data which shouldn't be repeating in an Excel spreadsheet

Hi,

This Python script that I’ve made scrapes data from the web and produces an Excel file. However, it seems to output a repeating set of data which I don’t want. Here is the code in question — more specifically, the `async def generate_clan_war_stats` function.

aftermath_militia_scraper.py

import http.client
import urllib
import asyncio
import aiohttp
import re
import httpx
import logging
from bs4 import BeautifulSoup
from openpyxl import Workbook
import openpyxl
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from datetime import datetime
import pandas as pd
from player_war_scraper import scrape_clan_wars_info  # Import the scrape_clan_wars_info function from player_war_scraper.py
from selenium.webdriver.remote.remote_connection import LOGGER
import tqdm

# Set the logging level to suppress messages
logging.getLogger('urllib3').setLevel(logging.WARNING)
LOGGER.setLevel(logging.WARNING)

# List of clan URLs with corresponding names
clan_urls = [
    # ('https://royaleapi.com/clan/8CYR2V', 'Aftermath'),
    # ('https://royaleapi.com/clan/L2RRRGVG', 'Afterlife'),
    # ('https://royaleapi.com/clan/YC8R0RJ0', 'Aftershock'),
    # ('https://royaleapi.com/clan/LUYLLQJR', 'Afterglow'),
    # ('https://royaleapi.com/clan/Q0JRGC22', 'Afterparty'),
    # ('https://royaleapi.com/clan/QY920UQR', 'Afterburn'),
    ('https://royaleapi.com/clan/G22P0PUG', 'Aftertaste')
]

# Maximum number of player tags to process
MAX_PLAYER_TAGS = 1  # Set this to any number up to 50

async def fetch_clan_data(url, clan_name):
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
            response.raise_for_status()
            return clan_name, response.text
    except Exception as e:
        print(f"Error fetching data from {url}: {e}")
        return clan_name, ""

async def fetch_all_clan_data():
    tasks = [fetch_clan_data(url, clan_name) for url, clan_name in clan_urls]
    return await asyncio.gather(*tasks)

async def generate_main_spreadsheet(clan_data):
    try:
        # Create a new workbook for the main spreadsheet
        main_wb = openpyxl.Workbook()
        # Delete the default sheet
        main_wb.remove(main_wb.active)

        clan_players_data = {}

        # Iterate over each clan's data
        for clan_name, content in clan_data:
            if not content:
                print(f"No content for {clan_name}, skipping...")
                continue

            soup = BeautifulSoup(content, 'html.parser')
            # Find the roster table
            roster_table = soup.find('table', id='roster')

            if roster_table:
                # Create a new worksheet for the current clan with the clan name
                ws = main_wb.create_sheet(title=clan_name)
                # Extract headers from the table
                headers = ['Player Tag', 'Name']
                # Write headers to the worksheet
                ws.append(headers)
                # Extract rows from the table
                rows = roster_table.find('tbody').find_all('tr')

                clan_players = []

                for row in rows:
                    try:
                        # Extract player tag
                        player_tag_div = row.find('div', class_='mobile-hide muted')
                        player_tag = player_tag_div.text.strip().replace('#', '') if player_tag_div else ''
                        # Extract player name
                        name_a = row.find('a', class_='block member_link')
                        name = name_a.contents[0].strip() if name_a else ''
                        # Write row data to the worksheet
                        ws.append([player_tag, name])

                        # Collect player data for war stats generation
                        clan_players.append((player_tag, name))
                    except AttributeError as e:
                        print(f"AttributeError: {str(e)}")
                        continue

                # Store clan players data
                clan_players_data[clan_name] = clan_players

                # Apply formatting to headers
                for row in ws.iter_rows(min_row=1, max_row=1):
                    for cell in row:
                        cell.font = Font(size=9)
                        cell.alignment = Alignment(horizontal='center', vertical='center')

                # Adjust column widths
                for col in ws.columns:
                    max_length = 0
                    column = col[0].column_letter
                    for cell in col:
                        try:
                            if len(str(cell.value)) > max_length:
                                max_length = len(cell.value)
                        except:
                            pass
                    adjusted_width = (max_length + 2) * 1.2
                    ws.column_dimensions[column].width = adjusted_width

                # Apply borders to all cells
                border = Border(left=Side(border_style='thin', color='BFBFBF'),
                                right=Side(border_style='thin', color='BFBFBF'),
                                top=Side(border_style='thin', color='BFBFBF'),
                                bottom=Side(border_style='thin', color='BFBFBF'))
                for row in ws.iter_rows():
                    for cell in row:
                        cell.border = border

                # Display clan information
                members_count = len(rows)
                last_updated = datetime.now().strftime("%d/%m/%Y | %H:%M:%S")
                ws['C2'] = f'Clan: '
                ws['D2'] = f'{clan_name}'
                ws['C3'] = f'Members: '
                ws['D3'] = f'{members_count}'
                ws['C4'] = f'Last Updated: '
                ws['D4'] = f'{last_updated}'

                # Apply formatting to clan information
                info_cells = ['C2', 'D2', 'C3', 'D3', 'C4', 'D4']
                for cell in info_cells:
                    ws[cell].font = Font(size=11)
                    ws[cell].alignment = Alignment(horizontal='right', vertical='center')

                # Apply fill colors to clan information
                ws['C2'].fill = PatternFill(start_color='FFFFFF', end_color='FFFFFF', fill_type='solid')
                ws['D2'].fill = PatternFill(start_color='FFFFFF', end_color='FFFFFF', fill_type='solid')
                ws['C3'].fill = PatternFill(start_color='D9D9D9', end_color='D9D9D9', fill_type='solid')
                ws['D3'].fill = PatternFill(start_color='D9D9D9', end_color='D9D9D9', fill_type='solid')
                ws['C4'].fill = PatternFill(start_color='FFFFFF', end_color='FFFFFF', fill_type='solid')
                ws['D4'].fill = PatternFill(start_color='FFFFFF', end_color='FFFFFF', fill_type='solid')

                # Apply borders to clan information cells
                for cell in info_cells:
                    ws[cell].border = border

                # Align text in Column C to the right and text in Column D to the left within the clan information
                for row in ws.iter_rows(min_row=2, max_row=4, min_col=3, max_col=4):
                    for cell in row:
                        if cell.column == 3:
                            cell.alignment = Alignment(horizontal='right', vertical='center')
                        else:
                            cell.alignment = Alignment(horizontal='left', vertical='center')

        # Save the main spreadsheet
        main_wb.save("Aftermath_Militia_Player_Data.xlsx")
        print("Main spreadsheet generation completed successfully.")

        return clan_players_data
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return {}

async def generate_clan_war_stats(clan_data, max_player_tags=50):
    try:
        for clan_name, players_data in clan_data.items():
            print(f"Processing war stats for clan {clan_name}...")

            # Create a new workbook for the clan's war stats
            clan_wb = openpyxl.Workbook()

            # Limit the number of players to process based on max_player_tags
            limited_players_data = players_data[:max_player_tags]

            for idx, (player_tag, player_name) in enumerate(tqdm.tqdm(limited_players_data, desc=f'Clan: {clan_name}'), start=1):
                print(f"Gathering war information for {player_tag} | #{idx}...")

                # Skip invalid player tags
                if not re.match(r'^#?[a-zA-Z0-9]+$', player_tag):
                    print(f"Invalid player tag '{player_tag}', skipping...")
                    continue

                try:
                    war_stats = await scrape_clan_wars_info(player_tag)  # Scrape war stats for the player
                    if war_stats:
                        player_war_stats = war_stats[0]['War Statistics']  # Extract war stats for the player

                        # Create a new worksheet for the player within the clan's workbook
                        ws = clan_wb.create_sheet(title=player_tag)

                        # Write player's war stats to the worksheet with Win Count and Win Percentage in the header
                        headers = ['You', 'Opp', 'Season ID', 'Date', 'Clan Name', 'Decks Used', 'Fame', 'Win Count', 'Win Percentage']
                        ws.append(headers)

                        # Write rows of player's war stats
                        for stat in player_war_stats:
                            row = list(stat.values())
                            ws.append(row)

                        print(f"War information for {player_tag} gathered successfully.")
                    else:
                        print(f"No war information found for {player_tag}.")
                except Exception as e:
                    print(f"An error occurred while gathering war information for {player_tag}: {str(e)}")

            # Save the clan's war stats to a single file after processing all players
            clan_stats_filename = f"{clan_name}_war_stats.xlsx"

            # Remove the blank sheet created at the beginning
            clan_wb.remove(clan_wb.active)

            # Save the edited workbook
            clan_wb.save(clan_stats_filename)

            print(f"War stats for clan {clan_name} generated successfully.")

    except Exception as e:
        print(f"An error occurred while generating war stats: {str(e)}")

async def main():
    try:
        clan_data = await fetch_all_clan_data()
        clan_players_data = await generate_main_spreadsheet(clan_data)
        await generate_clan_war_stats(clan_players_data, max_player_tags=MAX_PLAYER_TAGS)
    except Exception as e:
        print(f"An error occurred in main: {str(e)}")

if __name__ == "__main__":
    asyncio.run(main())

That is the main block of code, and then there is this code which is needed for the main script to run:

player_war_scraper.py

import re
import sys
import asyncio
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import httpx
from bs4 import BeautifulSoup
from io import StringIO

# Placeholder value for the number of rows of information to generate
num_rows = 12

async def scrape_clan_wars_info(player_tag):
    try:
        # Scrape card levels
        card_levels_df = await scrape_card_levels(player_tag)

        # Scrape war stats
        war_stats_df = await scrape_war_stats(player_tag)
        
        # Scrape win percentage
        count, win_percentage = await scrape_win_percentage(player_tag)

        # Merge card levels and war stats into one DataFrame
        combined_df = pd.concat([card_levels_df, war_stats_df], axis=1)

        # Add win statistics columns
        combined_df['Count'] = count
        combined_df['Percentage'] = win_percentage

        # Convert the DataFrame to a list of dictionaries
        combined_data = combined_df.to_dict(orient='records')

        return [{'War Statistics': combined_data}]
    except Exception as e:
        print(f"An error occurred while scraping war information for player {player_tag}: {str(e)}")
        return []

async def scrape_card_levels(player_tag):
    urls = [
        f"https://royaleapi.com/player/{player_tag}/battles/history?battle_type=warAny",
        f"https://royaleapi.com/player/{player_tag}/battles/history?before=1716772359000&&battle_type=warAny"
    ]

    all_data = []
    unique_battles = set()  # To keep track of unique battles

    for idx, url in enumerate(urls):
        data = await scrape_data(url)
        for battle_data in data:
            if tuple(battle_data['You']) not in unique_battles:  # Check if battle data already seen
                all_data.append(battle_data)
                unique_battles.add(tuple(battle_data['You']))  # Add battle data to the set of unique battles
        if idx < len(urls) - 1:  # Add an empty row between data from each URL
            all_data.append({})

    # Combine all battles into one DataFrame
    df = pd.DataFrame(columns=['You', 'Opp'])
    for battle_data in all_data:
        if not battle_data:  # If empty row, skip
            continue
        temp_df = pd.DataFrame(battle_data)
        df = pd.concat([df, temp_df], ignore_index=True)

    # Convert 'You' and 'Opp' columns to numeric
    df['You'] = df['You'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['Opp'] = df['Opp'].apply(lambda x: pd.to_numeric(x, errors='coerce'))

    return df

async def scrape_data(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        all_battles = []
        segments_seen = set()  # To keep track of segments already seen

        for battle in soup.find_all('div', class_=re.compile(r'\bbattle_\w+')):
            segments = battle.find_all('div', class_='ui basic segment team-segment')
            segment_ids = tuple(id(seg) for seg in segments)  # Get unique IDs for segments

            if segment_ids not in segments_seen:  # Check if segments already seen
                you_cards = []
                opp_cards = []

                for i, segment in enumerate(segments):
                    card_levels = segment.find_all('div', class_='ui basic center card-level')
                    if i % 2 == 0:  # Player's segment
                        # Extract only the numbers from card levels and convert to integers
                        you_cards.extend([int(re.search(r'\d+', card.get_text()).group()) for card in card_levels])
                    else:  # Opponent's segment
                        # Extract only the numbers from card levels and convert to integers
                        opp_cards.extend([int(re.search(r'\d+', card.get_text()).group()) for card in card_levels])

                # Ensure the length of the lists is the same
                max_length = max(len(you_cards), len(opp_cards))
                you_cards.extend([None] * (max_length - len(you_cards)))
                opp_cards.extend([None] * (max_length - len(opp_cards)))

                all_battles.append({'You': you_cards, 'Opp': opp_cards})
                segments_seen.add(segment_ids)  # Add segment IDs to the set

        return all_battles

async def scrape_war_stats(player_tag):
    war_stats = []
    url = f"https://royaleapi.com/player/{player_tag}"
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

    # Configure Chrome options for headless mode and disable image loading
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run Chrome in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
    chrome_options.add_argument("--disable-infobars")  # Disable info bars
    chrome_options.add_argument("--disable-dev-shm-usage")  # Disable /dev/shm usage
    chrome_options.add_argument("--no-sandbox")  # Disable sandbox mode
    chrome_options.add_argument("--disable-extensions")  # Disable extensions
    chrome_options.add_argument("--disable-browser-side-navigation")  # Disable browser side navigation
    chrome_options.add_argument("--disable-features=VizDisplayCompositor")  # Disable viz display compositor
    chrome_options.add_argument("--start-maximized")  # Start maximized
    chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})  # Disable image loading
    chrome_options.add_argument("--log-level=3")  # Suppress logging to the console

    # Initialize Chrome WebDriver with configured options
    driver = webdriver.Chrome(options=chrome_options)

    load_time = 1200

    # Set implicit wait time
    driver.implicitly_wait(load_time)

    # Open the webpage
    driver.get(url)

    try:
        accept_cookies(driver)

        # Click the button to load war history
        load_war_history(driver)

        # Wait for the table to be loaded
        table = WebDriverWait(driver, load_time).until(EC.visibility_of_element_located((By.CLASS_NAME, "player__cw2_history_table")))

        # Get the table headers with their respective data content
        headers = [th.get_attribute("data-content") for th in table.find_elements(By.TAG_NAME, "th")]

        # Specify the columns to select
        selected_columns = ['Season ID', 'Date', 'Clan Name', 'Decks Used', 'Fame']

        # Get the table data
        table_html = table.get_attribute('outerHTML')

        # Read the HTML into a DataFrame
        df = pd.read_html(StringIO(table_html))[0]

        # Rename the columns with their respective data content
        df.columns = headers

        # Select only the specified columns
        df_selected_columns = df[selected_columns]

        # Select only the specified number of rows
        df_selected_rows = df_selected_columns.head(num_rows)

        # Convert the DataFrame to a dictionary
        player_war_stats = df_selected_rows.to_dict(orient='records')

        # Create a dictionary to hold all war stats for the player
        consolidated_stats = {'Player Tag': player_tag, 'War Statistics': player_war_stats}

        # Append consolidated stats to the war_stats list
        war_stats.append(consolidated_stats)

    finally:
        # Close the WebDriver
        driver.quit()

    return pd.concat([pd.DataFrame(stats['War Statistics']) for stats in war_stats])

async def scrape_win_percentage(player_tag):
    try:
        url = f"https://royaleapi.com/player/{player_tag}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        async with httpx.AsyncClient() as client:
            response = await client.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Initialize variables to store win statistics
            win_percentage = None
            count = None

            # Find all tables in the page
            tables = soup.find_all('table')

            # Look for the table containing "Percentage" and "Count"
            for table in tables:
                if "Percentage" in table.get_text() and "Count" in table.get_text():
                    # Find the tbody element within the table
                    tbody = table.find('tbody')
                    # Extract the first row (index 0) from tbody
                    if tbody:
                        first_row = tbody.find('tr')
                        # Extract text content from the first row
                        if first_row:
                            cells = first_row.find_all('td')
                            if len(cells) >= 3:
                                count = cells[1].text.strip()
                                win_percentage = cells[2].text.strip()
                            break
            
            # Check if both count and win_percentage were found
            if count is not None and win_percentage is not None:
                return count, win_percentage  # Return count and win_percentage separately
            else:
                print("Count or Percentage table not found.")
                return None, None  # Return None for both values if not found
    except Exception as e:
        print(f"An error occurred while scraping win percentage for player {player_tag}: {str(e)}")
        return None, None  # Return None for both values in case of any error

def accept_cookies(driver):
    try:
        # Switch to the iframe
        driver.switch_to.frame("sp_message_iframe_1169948")

        # Switch to the document within the iframe
        iframe_document = driver.find_element(By.TAG_NAME, "html")
        driver.switch_to.frame(iframe_document)

        # Locate the "Accept" button within the dropdown
        accept_button = driver.find_element(By.XPATH, "//button[contains(text(), 'Accept')]")

        # Click the "Accept" button
        accept_button.click()

        # Switch back to the default content
        driver.switch_to.default_content()
    except:
        pass  # No cookie prompt found, continue without accepting

def load_war_history(driver):
    # Click the button using JavaScript
    load_button = driver.find_element(By.CLASS_NAME, "cw2_history_button")
    driver.execute_script("arguments[0].click();", load_button)

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Please provide the player tag:")
        sys.exit(1)

    player_tag = sys.argv[1]
    asyncio.run(scrape_clan_wars_info(player_tag))

I’ve also added an image of the Excel file in question. The “You” and “Opp” columns are meant to be like that; however, as you can see, the Win Count and Win Percentage values are repeated down as many rows as those columns occupy. I only want the Win Count and Win Percentage to be displayed in a single row, not several.