Chris McWain ᗧ··ᗣ·

Scrape the Bottom of the Barrel

May 2, 2024 at 12:10

Updated: 20 Feb 2025

It turns out that you can pull data from RealClearPolitics without getting your IP banned for 24 hours! Now I can knock off my final intermediate scraper training.

The following script will retrieve the RCP average percentage for Trump versus Biden (i.e., Pumpkin versus Potatus), as well as how our veggie in chief is doing. I first tried to pull it using a POST request with all the necessary headers, but it consistently failed; I suspect they’re blocking most methods. Eventually, I resorted to our reliable Playwright framework to accomplish the task.

The latest update (included below) adds a comparison to the previous values, which are stored in an external JSON file between runs.

political_polls.py

from playwright.sync_api import sync_playwright
import json
from urllib import request, parse
from slack_lib import send_message_to_slack     # my slack notification library
import time
import random
import os

# RealClearPolitics poll IDs to fetch, mapped to the label shown for each poll
# in the Slack message.
poll_titles = {
    8656: "Approval",
    8659: "Immigration",
    8660: "Crime",
    8661: "Inflation",
    8662: "Israel",
    8663: "Russia",
    8664: "Abortion"
}

# JSON file holding the results of the previous run. The path is relative, so
# it is resolved against the current working directory.
PREVIOUS_RESULTS_FILE = 'previous_results.json'

def load_previous_results():
    """Load the poll results saved by the previous run.

    Returns:
        The JSON object stored in ``PREVIOUS_RESULTS_FILE`` as a dict (its
        keys are strings, because JSON object keys always are), or an empty
        dict when the file is missing, cannot be decoded, or does not hold a
        JSON object.
    """
    try:
        with open(PREVIOUS_RESULTS_FILE, 'r', encoding='utf-8') as file:
            previous = json.load(file)
    except FileNotFoundError:
        # First run: there is nothing to compare against yet.
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A truncated or hand-edited file must not block every later run;
        # treat it as "no history" so the next save rewrites it.
        print(f"Ignoring unreadable {PREVIOUS_RESULTS_FILE}: {e}")
        return {}

    if not isinstance(previous, dict):
        # Callers index the result by poll ID, so anything but an object is
        # unusable as history.
        print(f"Ignoring {PREVIOUS_RESULTS_FILE}: expected a JSON object")
        return {}
    return previous

def save_current_results(current_results):
    """Write *current_results* to ``PREVIOUS_RESULTS_FILE`` as JSON.

    The data is serialized before any file is opened and is written to a
    sibling temporary file that then replaces the real one, so a failure
    part-way through never leaves a truncated results file behind.

    Args:
        current_results: JSON-serializable mapping of poll ID to the list of
            candidate dicts fetched for that poll.

    Raises:
        TypeError: If *current_results* contains a value JSON cannot encode.
        OSError: If the file cannot be written or replaced.
    """
    # Serialize first so an unencodable value cannot clobber the old file.
    payload = json.dumps(current_results)

    temp_path = f"{PREVIOUS_RESULTS_FILE}.tmp"
    with open(temp_path, 'w', encoding='utf-8') as file:
        file.write(payload)

    # Swap the finished file into place in a single rename.
    os.replace(temp_path, PREVIOUS_RESULTS_FILE)
def get_change_symbol(current, previous):
    """Return the emoji marking how *current* moved relative to *previous*.

    Args:
        current: Current poll value, as a number or numeric string.
        previous: Value from the previous run, or ``None`` when there is none.

    Returns:
        "👍" when the value rose, "🔻" when it fell, and "🔹" when it is
        unchanged, when there is no previous value, or when either value
        cannot be read as a number.
    """
    if previous is None:
        return "🔹"

    try:
        current_number = float(current)
        previous_number = float(previous)
    except (TypeError, ValueError):
        # A blank or placeholder value cannot be ranked; show the neutral
        # marker rather than aborting the whole report.
        return "🔹"

    if current_number > previous_number:
        return "👍"
    if current_number < previous_number:
        return "🔻"
    return "🔹"

def get_differences(current_results, previous_results):
    """Collect the candidates whose value differs from the previous run.

    Args:
        current_results: Mapping of poll ID to a list of candidate dicts,
            each with ``'name'`` and ``'value'`` keys.
        previous_results: Mapping keyed by the *string* form of the poll ID
            to the candidate dicts recorded earlier.

    Returns:
        A dict with the same keys as *current_results*, restricted to polls
        that have at least one candidate whose value is new or changed; each
        value lists only those candidates.
    """
    changed_by_poll = {}
    for poll_id, entries in current_results.items():
        changed = []
        for entry in entries:
            poll_key = str(poll_id)
            earlier = previous_results[poll_key] if poll_key in previous_results else ()
            # First earlier record with the same name wins; None means the
            # candidate (or the whole poll) has no recorded value.
            old_value = next(
                (old['value'] for old in earlier if old['name'] == entry['name']),
                None,
            )
            if old_value is not None:
                print(f"Comparing {entry['name']} in poll {poll_id}: current value = {entry['value']}, previous value = {old_value}")  # Debugging
            if entry['value'] != old_value:
                changed.append(entry)
        if changed:
            changed_by_poll[poll_id] = changed
    return changed_by_poll

def format_results(differences, previous_results):
    """Render the changed poll values as Slack message text.

    Args:
        differences: Mapping of poll ID to the candidate dicts to report.
        previous_results: Mapping keyed by the string form of the poll ID to
            the candidate dicts recorded earlier; used to pick each
            candidate's change symbol.

    Returns:
        One quoted line per poll: the poll title (or ``Poll ID <id>`` when
        the ID is not in ``poll_titles``) followed by each candidate's name,
        value and change symbol. Empty string when *differences* is empty.
    """
    pieces = []
    for poll_id, entries in differences.items():
        heading = poll_titles.get(poll_id, f"Poll ID {poll_id}")
        pieces.append(f"> *{heading}*: \t")
        for entry in entries:
            poll_key = str(poll_id)
            earlier = previous_results[poll_key] if poll_key in previous_results else ()
            # First earlier record with the same name wins; None when absent.
            old_value = next(
                (old['value'] for old in earlier if old['name'] == entry['name']),
                None,
            )
            marker = get_change_symbol(entry['value'], old_value)
            pieces.append(f"{entry['name']} {entry['value']}% {marker} \t")
        pieces.append("\n")
    return "".join(pieces)

def fetch_poll_data(playwright, poll_id):
    """Fetch the candidate values for one RCP poll through a headless browser.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
        poll_id: RCP poll ID, interpolated into the ``polling_data.json`` URL.

    Returns:
        A ``(poll_id, candidate_results)`` tuple. ``candidate_results`` is a
        list of ``{'name': ..., 'value': ...}`` dicts taken from the first
        entry of the response's ``poll`` list, or an empty list when the
        navigation fails, the status is not 200, or the body cannot be parsed.
    """
    # Imported here so the block carries its own dependency.
    from playwright.sync_api import Error as PlaywrightError

    browser = playwright.chromium.launch(
        headless=True,
        args=[
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-infobars',
            '--window-size=1920,1080',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        ]
    )
    candidate_results = []

    # Everything after launch runs under try/finally so the browser process
    # is closed even when navigation or parsing raises.
    try:
        page = browser.new_page()
        page.set_viewport_size({"width": 1920, "height": 1080})

        url = f"https://www.realclearpolitics.com/poll/race/{poll_id}/polling_data.json"
        try:
            response = page.goto(url)
        except PlaywrightError as e:
            # A timeout or network error on one poll should not abort the
            # remaining polls; report it like any other failed fetch.
            print(f"Failed to fetch data for poll_id {poll_id}: {e}")
            return poll_id, candidate_results

        if response is None:
            # goto() can return None instead of a response object.
            print(f"Failed to fetch data for poll_id {poll_id}: no response")
            return poll_id, candidate_results

        if response.status == 200:
            try:
                json_data = response.json()
                for candidate_data in json_data['poll'][0]['candidate']:
                    candidate_info = {'name': candidate_data['name'], 'value': candidate_data['value']}
                    candidate_results.append(candidate_info)
            except Exception as e:
                # Deliberately broad: the body may be non-JSON or have an
                # unexpected shape, and either way the poll is just skipped.
                print(f"Error parsing JSON data for poll_id {poll_id}: {e}")
        else:
            print(f"Failed to fetch data for poll_id {poll_id}: HTTP {response.status}")
            print(response.text())
    finally:
        browser.close()

    return poll_id, candidate_results

def main():
    """Fetch every configured poll, report changes to Slack, and save state.

    Loads the previous run's results, fetches each poll in ``poll_titles``,
    sends a Slack message listing the candidates whose values changed (if
    any), and then writes the results file for the next run.
    """
    all_poll_results = {}
    previous_results = load_previous_results()

    with sync_playwright() as playwright:
        for poll_id in poll_titles:
            # Random 5-14 s pause before each request; presumably to avoid
            # being flagged as a bot.
            time.sleep(random.randint(5, 14))
            _, results = fetch_poll_data(playwright, poll_id)
            if results:
                all_poll_results[poll_id] = results

    differences = get_differences(all_poll_results, previous_results)

    if differences:
        formatted_text = format_results(differences, previous_results)
        send_message_to_slack(formatted_text, username="👍 Presidential Bot 👎")

    # Carry forward the last known values of any poll that could not be
    # fetched this run. Saving only this run's successes would erase that
    # poll's history and make the next run report it as changed.
    results_to_save = dict(previous_results) if isinstance(previous_results, dict) else {}
    # Stored keys are strings (JSON object keys), so normalize before merging.
    results_to_save.update(
        (str(fetched_id), fetched) for fetched_id, fetched in all_poll_results.items()
    )
    save_current_results(results_to_save)


if __name__ == "__main__":
    main()

Below is a script to find other polls of interest. Just set the range of poll IDs passed to fetch_poll_titles and let the script tell you what’s what. Note that 7383 is the 2024 General Election and 7320 is President Potato’s approval rating. I put a two-second sleep in between requests to keep from being flagged.

scrapey_poll_list.py

    import requests
    import time

    def fetch_poll_titles(start, end, delay=2, timeout=10):
        """Look up the title of every RCP poll ID from *start* through *end*.

        Args:
            start: First poll ID to query.
            end: Last poll ID to query (inclusive).
            delay: Seconds to sleep before each request.
            timeout: Seconds to wait on each request before giving up.

        Returns:
            A dict mapping each poll ID to its title, to ``'No title found'``
            when the response carries none, to ``"Invalid JSON"`` when the
            body is not JSON, or to ``"Error: ..."`` when the request fails.
        """
        base_url = "https://www.realclearpolitics.com/poll/race/{}/polling_data.json"
        poll_titles = {}

        for poll_id in range(start, end + 1):
            url = base_url.format(poll_id)
            time.sleep(delay)

            try:
                # Without a timeout a stalled connection would hang the scan.
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()  # Check for HTTP errors
            except requests.exceptions.RequestException as e:
                poll_titles[poll_id] = f"Error: {e}"
                continue

            # Parsed in its own try: newer requests versions raise a JSON
            # error that is also a RequestException, which would otherwise be
            # reported as a request failure instead of "Invalid JSON".
            try:
                data = response.json()
            except ValueError:
                poll_titles[poll_id] = "Invalid JSON"
                continue

            # Valid JSON is not necessarily an object with a dict inside.
            module_info = data.get('moduleInfo') if isinstance(data, dict) else None
            if isinstance(module_info, dict):
                poll_titles[poll_id] = module_info.get('title', 'No title found')
            else:
                poll_titles[poll_id] = 'No title found'

        return poll_titles

    # Scan poll IDs 7250 through 7400 (inclusive) and record what each reports.
    poll_titles = fetch_poll_titles(7250, 7400)

    # Print one line per poll ID, in the order the IDs were queried.
    for found_id, found_title in poll_titles.items():
        print(f"Poll ID: {found_id} - Title: {found_title}")

20 Feb 2025
- Updated poll numbers for the new President. They're just now starting to populate.
02 Aug 2024
- Changed name of Slack Bot to "Election Bot" to reflect potato dropping.
23 July 2024
- Updated `General Election` to Harris v. Trump. Biden v. Trump was poll #7383

Questions or comments?

Recent Activity

Scrape the Bottom of the Barrel

Tags