Scrape the Bottom of the Barrel

It turns out that one can pull data from RealClearPolitics without having one's IP banned for 24 hours! Now I can knock off my final intermediate scraper training.

The following script will retrieve the RCP average percentage for Trump versus Biden (i.e., Pumpkin versus Potatus). I first tried to pull it using a POST request with all the necessary headers, but it consistently failed. I suspect they’re blocking most methods. Eventually, I resorted to our reliable Playwright framework to accomplish the task.

Election 2024

election2024.py

from playwright.sync_api import sync_playwright
import json
from urllib import request, parse

def format_results(candidate_results):
    """Render candidate results as Slack block-quote lines.

    Each entry must be a mapping with ``'name'`` and ``'value'`` keys; it
    becomes one ``"> name: value%"`` line terminated by a newline. An empty
    input yields an empty string.
    """
    return "".join(
        f"> {entry['name']}: {entry['value']}%\n"
        for entry in candidate_results
    )

def send_message_to_slack(text):
    """Post ``text`` to the Slack incoming webhook as a JSON payload.

    Delivery is best-effort: any failure (serialization, network, HTTP
    error status) is printed with an ``EXCEPTION:`` prefix instead of being
    raised, so the caller never sees an exception from this function.

    Args:
        text: Message body placed under the ``"text"`` key of the payload.
    """
    try:
        payload = json.dumps({"text": text}).encode('utf-8')
        # Replace 'APIKEY' with your actual Slack webhook URL segment
        req = request.Request(
            "https://hooks.slack.com/services/APIKEY",
            data=payload,
            headers={'Content-Type': 'application/json'},
        )
        # The timeout keeps a stalled connection from hanging the script,
        # and the context manager guarantees the response is closed.
        with request.urlopen(req, timeout=10) as resp:
            resp.read()
    except Exception as em:
        print("EXCEPTION: " + str(em))

def run(playwright):
    """Fetch the RCP polling JSON in headless Chromium and extract candidates.

    Args:
        playwright: The object yielded by ``sync_playwright()``.

    Returns:
        A list of ``{'name': ..., 'value': ...}`` dicts, one per entry in
        ``json_data['poll'][0]['candidate']``. The list is empty when the
        navigation yields no response or a non-OK response.
    """
    browser = playwright.chromium.launch(
        headless=True,
        args=[
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-infobars',
            '--window-size=1920,1080',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        ]
    )
    # try/finally ensures the browser is shut down even if navigation or
    # JSON extraction raises.
    try:
        page = browser.new_page()
        page.set_viewport_size({"width": 1920, "height": 1080})

        response = page.goto("https://www.realclearpolitics.com/poll/race/7383/polling_data.json")
        # use poll 7320 for approval rating

        # page.goto may return None, so guard before touching .ok
        if response is None or not response.ok:
            return []

        json_data = response.json()
        return [
            {'name': candidate_data['name'], 'value': candidate_data['value']}
            for candidate_data in json_data['poll'][0]['candidate']
        ]
    finally:
        browser.close()

# Script body: scrape the poll averages, format them, and post to Slack.
with sync_playwright() as playwright:
    # Candidate name/value pairs pulled from the polling JSON.
    results = run(playwright)
    # One "> name: value%" line per candidate.
    formatted_text = format_results(results)
    send_message_to_slack(f"🪧 *Election Bot* 🗳️\n{formatted_text}")
Previous: Grabbing Stock Quotes Next: Godzilla Minus One