python - HTML in the inspect element is different from the one displayed on screen - Stack Overflow

时间: 2025-01-06 admin 业界

I am trying to scrape the data from this website: .aspx?Game=2009_1211_2563_2684-Lebanon

The website contains two tables, but the data displayed in the rows of the tables is different from the data in the HTML source (as seen with "Inspect Element").

For example, this is the HTML for the first row:

<tr class="my_pStats1" onmouseover="this.style.backgroundColor='#C3C3C3';" onmouseout="this.style.backgroundColor='#FFFFFF';" valign="center" height="17" style="background-color: rgb(255, 255, 255);">
<td class="headcol">&nbsp;</td>
<td class="headcol2 my_playerName" align="left"><a class="my_playerB" href=""><font color="#0066cc">SMdRl-XIuQ, zRij</font></a></td>
<td>45</td>
<td>4-9 (38.7%)</td>
<td>0-9 (96.3%)</td>
<td>5-5 (5%)</td>
<td class="hiddensmall">5</td>
<td class="hiddensmall">6</td>
<td>6</td>
<td>1</td>
<td>6</td>
<td class="hiddensmall">5</td>
<td class="hiddensmall">5</td>
<td class="hiddensmall">6</td>
<td class="hiddensmall">5</td>
<td class="hiddensmall">8</td>
<td>86</td>
<td class="hiddensmall">5</td>
<td class="hiddensmall">5</td></tr>

But the name of the player shown on screen is Jean Abdel-Nour, not "SMdRl-XIuQ, zRij", and the numbers in the source differ from the displayed ones in the same way.

I tried Selenium, but it didn't work:

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

def extract_box_score_from_url(url):
    """Scrape both teams' box-score tables from a game page into a DataFrame.

    The page is opened in a Selenium-driven Chrome session and the resulting
    ``page_source`` is parsed with BeautifulSoup.

    Args:
        url: Address of the game page to load.

    Returns:
        A pandas DataFrame with one row per player. The columns are the cell
        texts of the second header row of the first stats table, followed by
        ``Team`` and ``Opponent``.

    Raises:
        AttributeError, IndexError: If the parsed page does not contain the
            expected elements (two ``table#aannew`` elements each holding a
            link, two ``div.dvbs`` blocks, a two-row ``thead``, a ``tbody``).
    """
    # Fetch the webpage content
    driver = webdriver.Chrome()  # Ensure ChromeDriver is installed and in PATH
    try:
        driver.get(url)
        html_content = driver.page_source
    finally:
        # Close the browser even when loading fails, so no Chrome/ChromeDriver
        # process is left running.
        driver.quit()
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract team and opponent names
    team = soup.find('table', {'id': 'aannew'}).find('a').text.strip()
    opponent = soup.find_all('table', {'id': 'aannew'})[1].find('a').text.strip()

    # Extract headers
    stats_divs = soup.find_all('div', class_='dvbs')
    header_rows = stats_divs[0].find('thead').find_all('tr')

    # Column names come from the second header row only.
    headers = [th.get_text(strip=True) for th in header_rows[1].find_all('th')]

    # Add Team and Opponent columns to headers
    headers += ['Team', 'Opponent']

    # Function to extract stats table for a team
    def extract_team_stats(dvbs):
        """Return the cell texts of every player row in one stats block."""
        rows = dvbs.find('tbody').find_all('tr', class_=['my_pStats1', 'my_pStats2'])
        return [
            [col.get_text(strip=True) for col in row.find_all('td')]
            for row in rows
        ]

    # Extract stats for both teams
    team_stats = extract_team_stats(stats_divs[0])
    opponent_stats = extract_team_stats(stats_divs[1])

    # Add Team and Opponent columns. Rows whose cell count does not match the
    # header count are dropped so every remaining row fits the DataFrame.
    num_columns = len(headers)
    team_stats = [row + [team, opponent] for row in team_stats if len(row) + 2 == num_columns]
    opponent_stats = [row + [opponent, team] for row in opponent_stats if len(row) + 2 == num_columns]

    # Combine data
    combined_stats = team_stats + opponent_stats

    # Create dataframe
    df = pd.DataFrame(combined_stats, columns=headers)

    return df

# NOTE(review): this URL has no scheme or host, only a path suffix and a query
# string. It looks truncated; confirm the full game-page address before running.
url = ".aspx?Game=2009_1211_2563_2684-Lebanon"
df = extract_box_score_from_url(url)

# Bare expression: presumably meant to display the DataFrame in a notebook or
# REPL; it has no effect when the file is run as a script.
df

Can you please help me find a way to scrape this data? I tried Selenium:

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

def extract_box_score_from_url(url):
    """Scrape both teams' box-score tables from a game page into a DataFrame.

    The page is opened in a Selenium-driven Chrome session and the resulting
    ``page_source`` is parsed with BeautifulSoup.

    Args:
        url: Address of the game page to load.

    Returns:
        A pandas DataFrame with one row per player. The columns are the cell
        texts of the second header row of the first stats table, followed by
        ``Team`` and ``Opponent``.

    Raises:
        AttributeError, IndexError: If the parsed page does not contain the
            expected elements (two ``table#aannew`` elements each holding a
            link, two ``div.dvbs`` blocks, a two-row ``thead``, a ``tbody``).
    """
    # Fetch the webpage content
    driver = webdriver.Chrome()  # Ensure ChromeDriver is installed and in PATH
    try:
        driver.get(url)
        html_content = driver.page_source
    finally:
        # Close the browser even when loading fails, so no Chrome/ChromeDriver
        # process is left running.
        driver.quit()
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract team and opponent names
    team = soup.find('table', {'id': 'aannew'}).find('a').text.strip()
    opponent = soup.find_all('table', {'id': 'aannew'})[1].find('a').text.strip()

    # Extract headers
    stats_divs = soup.find_all('div', class_='dvbs')
    header_rows = stats_divs[0].find('thead').find_all('tr')

    # Column names come from the second header row only.
    headers = [th.get_text(strip=True) for th in header_rows[1].find_all('th')]

    # Add Team and Opponent columns to headers
    headers += ['Team', 'Opponent']

    # Function to extract stats table for a team
    def extract_team_stats(dvbs):
        """Return the cell texts of every player row in one stats block."""
        rows = dvbs.find('tbody').find_all('tr', class_=['my_pStats1', 'my_pStats2'])
        return [
            [col.get_text(strip=True) for col in row.find_all('td')]
            for row in rows
        ]

    # Extract stats for both teams
    team_stats = extract_team_stats(stats_divs[0])
    opponent_stats = extract_team_stats(stats_divs[1])

    # Add Team and Opponent columns. Rows whose cell count does not match the
    # header count are dropped so every remaining row fits the DataFrame.
    num_columns = len(headers)
    team_stats = [row + [team, opponent] for row in team_stats if len(row) + 2 == num_columns]
    opponent_stats = [row + [opponent, team] for row in opponent_stats if len(row) + 2 == num_columns]

    # Combine data
    combined_stats = team_stats + opponent_stats

    # Create dataframe
    df = pd.DataFrame(combined_stats, columns=headers)

    return df

# NOTE(review): this URL has no scheme or host, only a path suffix and a query
# string. It looks truncated; confirm the full game-page address before running.
url = ".aspx?Game=2009_1211_2563_2684-Lebanon"
df = extract_box_score_from_url(url)

# Bare expression: presumably meant to display the DataFrame in a notebook or
# REPL; it has no effect when the file is run as a script.
df
最新文章