Scrape the Bottom of the Barrel

It turns out that one can pull data from RealClearPolitics without having their IP banned for 24 hours! Now I can knock off my final intermediate scraper training.

The following script will retrieve the RCP average percentage for Trump versus Biden (i.e., Pumpkin versus Potatus), as well as how our veggie in chief is doing. I first tried pulling the data with a POST request carrying all the necessary headers, but it consistently failed. I suspect they’re blocking most methods. Eventually, I resorted to our reliable Playwright framework to accomplish the task.

The latest update (included below) adds in a comparison to previous values by writing to an external json file.

Election 2024

election_2024.py

    from playwright.sync_api import sync_playwright
    import json
    from slack_lib import send_message_to_slack
    import time
    import random
    import os

    # Mapping poll IDs to their titles.
    # Each key is the numeric ID that fetch_poll_data interpolates into the
    # polling_data.json URL; each value is the label format_results prints.
    poll_titles = {
        7386: "General Election",
        7320: "Job Approval",
        7321: "Economy",
        7359: "Immigration",
        7832: "Inflation",
        7833: "Crime"
    }

    # File to store previous results.
    # Relative path, so it resolves against the directory the script is run
    # from. Read by load_previous_results, written by save_current_results.
    PREVIOUS_RESULTS_FILE = 'previous_results.json'

    def load_previous_results(path=None):
        """Load the poll results stored by the previous run.

        Args:
            path: JSON file to read. Defaults to PREVIOUS_RESULTS_FILE.

        Returns:
            The decoded JSON content, or an empty dict when the file does not
            exist or does not contain valid JSON.
        """
        if path is None:
            path = PREVIOUS_RESULTS_FILE
        # Open directly instead of checking os.path.exists first: this avoids
        # the gap between the check and the open.
        try:
            with open(path, 'r', encoding='utf-8') as file:
                return json.load(file)
        except FileNotFoundError:
            return {}
        except json.JSONDecodeError as e:
            # An empty or truncated file (e.g. a run interrupted mid-write)
            # should not stop the scraper; treat it as "no history yet".
            print(f"Ignoring unreadable previous results in {path}: {e}")
            return {}

    def save_current_results(current_results, path=None):
        """Persist poll results so the next run can compare against them.

        The data is first written to a temporary sibling file and then moved
        over the target with os.replace, so an interrupted run cannot leave a
        half-written JSON file behind.

        Args:
            current_results: JSON-serializable results to store.
            path: Destination file. Defaults to PREVIOUS_RESULTS_FILE.
        """
        if path is None:
            path = PREVIOUS_RESULTS_FILE
        tmp_path = f"{os.fspath(path)}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(current_results, file)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, path)

    def get_change_symbol(current, previous):
        """Pick the emoji that marks how a value moved since the last run.

        Both values are compared as floats. A rise yields "👍", a drop yields
        "🔻", and "🔹" is used when the value is unchanged or when there is no
        previous value to compare against.
        """
        if previous is None:
            return "🔹"

        now, before = float(current), float(previous)
        if now > before:
            return "👍"
        if now < before:
            return "🔻"
        return "🔹"

    def get_differences(current_results, previous_results):
        """Return the candidates whose value changed since the previous run.

        Args:
            current_results: Mapping of poll ID to a list of
                {'name': ..., 'value': ...} dicts gathered in this run.
            previous_results: The stored results from the last run; each poll
                is looked up under str(poll_id).

        Returns:
            A dict mapping poll ID to the list of that poll's candidate dicts
            whose value differs from the stored one. Polls without changes are
            left out. Candidates with no stored previous value are not
            reported, so a first run (or a newly tracked poll) does not flag
            every number as a change.
        """
        differences = {}
        for poll_id, candidate_results in current_results.items():
            # The stored data is keyed by the string form of the poll ID.
            previous_candidates = previous_results.get(str(poll_id), [])
            poll_differences = []
            for candidate in candidate_results:
                # Value of the first stored entry with the same name, if any.
                previous_value = next(
                    (prev['value'] for prev in previous_candidates
                     if prev['name'] == candidate['name']),
                    None,
                )
                if previous_value is not None and candidate['value'] != previous_value:
                    poll_differences.append(candidate)
            if poll_differences:
                differences[poll_id] = poll_differences
        return differences

    def format_results(differences, previous_results):
        """Build the Slack message text for the polls that changed.

        Each poll in differences becomes one line: a quoted, bold title taken
        from poll_titles (or "Poll ID <id>" when the ID is unknown), followed
        by every candidate's name, value and change symbol. The symbol comes
        from get_change_symbol, fed with the candidate's stored value from
        previous_results (None when there is none).
        """
        lines = []
        for poll_id, candidates in differences.items():
            title = poll_titles.get(poll_id, f"Poll ID {poll_id}")
            pieces = [f"> *{title}*: \t"]
            stored = previous_results.get(str(poll_id), [])
            for candidate in candidates:
                # First stored entry with a matching name wins.
                old_value = next(
                    (entry['value'] for entry in stored
                     if entry['name'] == candidate['name']),
                    None,
                )
                symbol = get_change_symbol(candidate['value'], old_value)
                pieces.append(f"{candidate['name']} {candidate['value']}% {symbol} \t")
            pieces.append("\n")
            lines.append("".join(pieces))
        return "".join(lines)

    def fetch_poll_data(playwright, poll_id):
        """Fetch the current values for one poll through headless Chromium.

        Args:
            playwright: The object yielded by sync_playwright().
            poll_id: Numeric poll ID placed into the polling_data.json URL.

        Returns:
            A (poll_id, candidate_results) tuple. candidate_results is a list
            of {'name': ..., 'value': ...} dicts built from the first entry of
            the response's 'poll' list. It is empty when no response was
            received, the status is not 200, or the payload cannot be parsed.

        Raises:
            Any exception raised while opening the page or navigating is
            propagated, after the browser has been closed.
        """
        browser = playwright.chromium.launch(
            headless=True,
            args=[
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-infobars',
                '--window-size=1920,1080',
                '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
            ]
        )

        candidate_results = []

        # try/finally so the browser is shut down even when navigation raises
        # (e.g. a timeout); otherwise the Chromium process would be left open.
        try:
            page = browser.new_page()
            page.set_viewport_size({"width": 1920, "height": 1080})

            url = f"https://www.realclearpolitics.com/poll/race/{poll_id}/polling_data.json"
            response = page.goto(url)

            if response is None:
                # page.goto is typed as returning an optional response.
                print(f"Failed to fetch data for poll_id {poll_id}: no response")
            elif response.status == 200:
                try:
                    json_data = response.json()
                    # Built in one expression so a failure part-way through
                    # leaves the result empty rather than partially filled.
                    candidate_results = [
                        {'name': candidate_data['name'], 'value': candidate_data['value']}
                        for candidate_data in json_data['poll'][0]['candidate']
                    ]
                except Exception as e:
                    print(f"Error parsing JSON data for poll_id {poll_id}: {e}")
            else:
                print(f"Failed to fetch data for poll_id {poll_id}: HTTP {response.status}")
                print(response.text())
        finally:
            browser.close()

        return poll_id, candidate_results

    def main():
        """Scrape every tracked poll, report changes to Slack, store results.

        Loads the results of the previous run, fetches each poll listed in
        poll_titles, sends one Slack message when any value changed, and then
        writes the results back to disk for the next run.
        """
        all_poll_results = {}
        previous_results = load_previous_results()

        with sync_playwright() as playwright:
            for poll_id in poll_titles:
                # Random pause before each request; presumably to avoid being
                # flagged as a bot.
                time.sleep(random.randint(5, 14))
                _, results = fetch_poll_data(playwright, poll_id)
                if results:
                    all_poll_results[poll_id] = results

        differences = get_differences(all_poll_results, previous_results)

        if differences:
            formatted_text = format_results(differences, previous_results)

            send_message_to_slack(formatted_text, username="👍 Election Bot 👎")

        # Merge into the previous results instead of replacing them, so a poll
        # that could not be fetched this run keeps its stored values (and a
        # fully failed run does not wipe the history). Keys are converted to
        # str to match the keys already in the stored data and avoid writing
        # the same poll under both an int and a str key.
        merged_results = dict(previous_results)
        merged_results.update(
            {str(poll_id): results for poll_id, results in all_poll_results.items()}
        )
        save_current_results(merged_results)

    # Run the scraper only when this file is executed as a script, not when
    # it is imported as a module.
    if __name__ == "__main__":
        main()

Below is a script to find other polls of interest. Just toss numbers into the poll_titles variable and let the script tell you what’s what. Note that 7383 was the 2024 General Election (Biden v. Trump; Harris v. Trump is 7386) and 7320 is President Potato’s approval rating. I put a couple of seconds of sleep in between requests to keep from being flagged.

scrapey_poll_list.py

    import requests
    import time

    def fetch_poll_titles(start, end, delay=2, timeout=10):
        """Look up the title of every poll ID from start through end.

        Args:
            start: First poll ID to query.
            end: Last poll ID to query (inclusive).
            delay: Seconds to sleep before each request.
            timeout: Seconds to wait for each HTTP response before giving up,
                so one stalled connection cannot hang the whole scan.

        Returns:
            A dict mapping each poll ID to its title. The value is
            'No title found' when the payload has no moduleInfo title,
            "Error: <details>" when the request failed, and "Invalid JSON"
            when the body could not be decoded.
        """
        base_url = "https://www.realclearpolitics.com/poll/race/{}/polling_data.json"
        poll_titles = {}

        for poll_id in range(start, end + 1):
            url = base_url.format(poll_id)
            time.sleep(delay)
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()  # Check for HTTP errors
            except requests.exceptions.RequestException as e:
                poll_titles[poll_id] = f"Error: {e}"
                continue

            # Decoded in its own try block: the JSON error raised by newer
            # requests versions is also a RequestException, so a shared
            # handler chain would report bad JSON as a request error.
            try:
                data = response.json()
            except ValueError:
                poll_titles[poll_id] = "Invalid JSON"
                continue

            # Valid JSON is not necessarily an object; guard before .get().
            module_info = data.get('moduleInfo', {}) if isinstance(data, dict) else {}
            if not isinstance(module_info, dict):
                module_info = {}
            poll_titles[poll_id] = module_info.get('title', 'No title found')

        return poll_titles

    # Probe every poll ID from 7250 through 7400 and list what each one is
    poll_titles = fetch_poll_titles(start=7250, end=7400)

    for poll_id in poll_titles:
        title = poll_titles[poll_id]
        print(f"Poll ID: {poll_id} - Title: {title}")

 

  • 02Aug2024
    • Changed name of Slack Bot to "Election Bot" to reflect potato dropping.
  • 23July2024
    • Updated `General Election` to Harris v. Trump. Biden v. Trump was poll #7383
Previous: Grabbing Stock Quotes Next: Godzilla Minus One