It turns out that you can pull data from RealClearPolitics without getting your IP banned for 24 hours! Now I can knock off my final intermediate scraper training.
The following script will retrieve the RCP average percentage for Trump versus Biden (i.e., Pumpkin versus Potatus), as well as how our veggie in chief is doing. I first tried to pull the data using a POST request with all the necessary headers, but it consistently failed. I suspect they’re blocking most methods. Eventually, I resorted to our reliable Playwright framework to accomplish the task.
The latest update (included below) adds a comparison to the previous values, which are stored in an external JSON file between runs.
political_polls.py
import json
import os
import random
import time
from urllib import request, parse

from playwright.sync_api import sync_playwright

from slack_lib import send_message_to_slack  # my slack notification library
# RealClearPolitics poll IDs to scrape, mapped to the short label that is
# shown for each poll in the Slack message.
poll_titles = {
    8656: "Approval",
    8659: "Immigration",
    8660: "Crime",
    8661: "Inflation",
    8662: "Israel",
    8663: "Russia",
    8664: "Abortion",
}
# JSON file (relative to the working directory) that holds the results of the
# last run so the next run can report what changed.
PREVIOUS_RESULTS_FILE = 'previous_results.json'


def load_previous_results():
    """Load the poll results stored by the previous run.

    Returns:
        The decoded contents of ``PREVIOUS_RESULTS_FILE``, or an empty dict
        when the file does not exist or does not contain valid JSON. JSON
        object keys are always strings, so poll IDs come back as ``str``.
    """
    try:
        with open(PREVIOUS_RESULTS_FILE, 'r') as file:
            return json.load(file)
    except FileNotFoundError:
        # First run: there is nothing to compare against yet.
        return {}
    except json.JSONDecodeError as e:
        # A truncated or hand-edited file must not stop every future run;
        # treat it as "no history" and let the next save overwrite it.
        print(f"Ignoring unreadable {PREVIOUS_RESULTS_FILE}: {e}")
        return {}
def save_current_results(current_results):
    """Persist this run's poll results for comparison on the next run.

    Overwrites ``PREVIOUS_RESULTS_FILE`` with the JSON encoding of
    *current_results*.

    Args:
        current_results: JSON-serialisable mapping of poll ID to that poll's
            list of candidate dicts. Integer keys are written as strings,
            because JSON object keys are always strings.
    """
    with open(PREVIOUS_RESULTS_FILE, 'w') as file:
        json.dump(current_results, file)
def get_change_symbol(current, previous):
    """Return an emoji describing how a value moved since the last run.

    Args:
        current: The current value; anything ``float()`` accepts.
        previous: The previously stored value, or ``None`` if there is none.

    Returns:
        "👍" when *current* is higher than *previous*, "🔻" when it is lower,
        and "🔹" when they are equal, when there is no previous value, or
        when either value cannot be interpreted as a number.
    """
    if previous is None:
        return "🔹"
    try:
        current_number = float(current)
        previous_number = float(previous)
    except (TypeError, ValueError):
        # A non-numeric value (e.g. an empty string) should not abort the
        # whole notification; report it as "no change".
        return "🔹"
    if current_number > previous_number:
        return "👍"
    if current_number < previous_number:
        return "🔻"
    return "🔹"


def _find_previous_value(previous_results, poll_id, name):
    """Return the stored value of candidate *name* in *poll_id*, or None."""
    # Stored results went through JSON, so their poll IDs are strings.
    for prev_candidate in previous_results.get(str(poll_id), []):
        if prev_candidate['name'] == name:
            return prev_candidate['value']
    return None


def get_differences(current_results, previous_results):
    """Select the candidates whose value differs from the previous run.

    Args:
        current_results: Mapping of poll ID to a list of
            ``{'name': ..., 'value': ...}`` dicts fetched in this run.
        previous_results: The same structure as loaded from disk, keyed by
            the poll ID as a string.

    Returns:
        A dict mapping poll ID to the list of candidate dicts that changed.
        Candidates with no stored previous value count as changed. Polls
        without any changed candidate are omitted.
    """
    differences = {}
    for poll_id, candidate_results in current_results.items():
        poll_differences = []
        for candidate in candidate_results:
            previous_value = _find_previous_value(
                previous_results, poll_id, candidate['name']
            )
            if previous_value is not None:
                print(f"Comparing {candidate['name']} in poll {poll_id}: current value = {candidate['value']}, previous value = {previous_value}")  # Debugging
            if candidate['value'] != previous_value:
                poll_differences.append(candidate)
        if poll_differences:
            differences[poll_id] = poll_differences
    return differences
def format_results(differences, previous_results):
    """Render the changed poll values as Slack message text.

    Each poll becomes one quoted line: the bold poll title followed by a
    tab-separated ``<name> <value>% <symbol>`` entry per candidate, where the
    symbol comes from ``get_change_symbol``.

    Args:
        differences: Mapping of poll ID to the list of changed
            ``{'name': ..., 'value': ...}`` dicts.
        previous_results: Results of the previous run, keyed by the poll ID
            as a string; used to pick the change symbol.

    Returns:
        The formatted message, or an empty string if *differences* is empty.
    """
    lines = []
    for poll_id, candidate_results in differences.items():
        poll_title = poll_titles.get(poll_id, f"Poll ID {poll_id}")

        # Index this poll's previous values by candidate name; setdefault
        # keeps the first entry if a name appears more than once.
        previous_by_name = {}
        for prev_candidate in previous_results.get(str(poll_id), []):
            previous_by_name.setdefault(
                prev_candidate['name'], prev_candidate['value']
            )

        parts = [f"> *{poll_title}*: \t"]
        for candidate in candidate_results:
            change_symbol = get_change_symbol(
                candidate['value'], previous_by_name.get(candidate['name'])
            )
            # Space between name and value so the entry reads
            # "Approve 45.0%" rather than "Approve45.0%".
            parts.append(
                f"{candidate['name']} {candidate['value']}% {change_symbol}\t"
            )
        lines.append("".join(parts) + "\n")
    return "".join(lines)
def fetch_poll_data(playwright, poll_id):
    """Fetch the candidate values of one RealClearPolitics poll.

    Launches a headless Chromium with a desktop user-agent string, loads the
    poll's ``polling_data.json`` URL and reads the candidates of the first
    entry in the ``poll`` list.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
        poll_id: RealClearPolitics poll ID used to build the URL.

    Returns:
        A ``(poll_id, candidate_results)`` tuple. ``candidate_results`` is a
        list of ``{'name': ..., 'value': ...}`` dicts, and is empty when the
        request did not return HTTP 200 or the payload could not be parsed.
    """
    browser = playwright.chromium.launch(
        headless=True,
        args=[
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-infobars',
            '--window-size=1920,1080',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        ]
    )
    candidate_results = []
    try:
        page = browser.new_page()
        page.set_viewport_size({"width": 1920, "height": 1080})
        url = f"https://www.realclearpolitics.com/poll/race/{poll_id}/polling_data.json"
        response = page.goto(url)
        if response is None:
            # goto() can return None for some navigations; treat it like a
            # failed fetch instead of crashing on response.status.
            print(f"Failed to fetch data for poll_id {poll_id}: no response")
        elif response.status == 200:
            try:
                json_data = response.json()
                # Built in one step so a malformed entry leaves the result
                # empty rather than half-filled.
                candidate_results = [
                    {'name': candidate_data['name'], 'value': candidate_data['value']}
                    for candidate_data in json_data['poll'][0]['candidate']
                ]
            except Exception as e:
                print(f"Error parsing JSON data for poll_id {poll_id}: {e}")
        else:
            print(f"Failed to fetch data for poll_id {poll_id}: HTTP {response.status}")
            print(response.text())
    finally:
        # Always release the browser process, even if navigation raised.
        browser.close()
    return poll_id, candidate_results
def main():
    """Fetch every configured poll, report changes to Slack, save results.

    Loads the previous results, fetches each poll in ``poll_titles`` with a
    random 5-14 second pause before every request, sends the changed values
    to Slack when there are any, and stores the latest values for the next
    run.
    """
    all_poll_results = {}
    previous_results = load_previous_results()

    with sync_playwright() as playwright:
        for requested_id in poll_titles:
            # Randomised pause between requests (presumably to avoid being
            # rate-limited or banned by the site).
            time.sleep(random.randint(5, 14))
            poll_id, results = fetch_poll_data(playwright, requested_id)
            if results:
                all_poll_results[poll_id] = results

    differences = get_differences(all_poll_results, previous_results)
    if differences:
        formatted_text = format_results(differences, previous_results)
        send_message_to_slack(formatted_text, username="👍 Presidential Bot 👎")

    # Keep the last known values of polls that could not be fetched this
    # run; otherwise one transient failure would erase their history and
    # make them show up as "changed" on the next run. Keys are stringified
    # so they line up with the string keys already present in the file.
    results_to_save = dict(previous_results)
    results_to_save.update(
        {str(poll_id): results for poll_id, results in all_poll_results.items()}
    )
    save_current_results(results_to_save)


if __name__ == "__main__":
    main()
Below is a script to find other polls of interest. Just pass the range of poll IDs you want to check to fetch_poll_titles and let the script tell you what’s what. Note that 7383 is the 2024 General Election and 7320 is President Potato’s approval rating. I put two seconds of sleep in between requests to keep from being flagged.
scrapey_poll_list.py
import requests
import time
def fetch_poll_titles(start, end, delay=2, timeout=30):
    """Look up the title of every poll ID in an inclusive range.

    Args:
        start: First poll ID to query.
        end: Last poll ID to query (inclusive).
        delay: Seconds to sleep before each request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A dict mapping each poll ID to its title. The value is
        ``'No title found'`` when the payload has no ``moduleInfo.title``,
        ``"Error: ..."`` when the request failed, and ``"Invalid JSON"``
        when the body could not be decoded. Empty when ``end < start``.
    """
    base_url = "https://www.realclearpolitics.com/poll/race/{}/polling_data.json"
    poll_titles = {}
    for poll_id in range(start, end + 1):
        url = base_url.format(poll_id)
        time.sleep(delay)
        try:
            # Without a timeout a single stalled connection hangs the scan.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Check for HTTP errors
        except requests.exceptions.RequestException as e:
            poll_titles[poll_id] = f"Error: {e}"
            continue
        # Decode separately: newer Requests versions raise a JSON error that
        # is both a RequestException and a ValueError, which would otherwise
        # be reported as a generic request error.
        try:
            data = response.json()
        except ValueError:
            poll_titles[poll_id] = "Invalid JSON"
            continue
        poll_titles[poll_id] = data.get('moduleInfo', {}).get('title', 'No title found')
    return poll_titles
# Fetch titles for polls 7250 through 7400 (151 requests; with the default
# two-second pause per request this takes at least five minutes).
poll_titles = fetch_poll_titles(7250, 7400)
for poll_id, title in poll_titles.items():
    print(f"Poll ID: {poll_id} - Title: {title}")
20 Feb 2025
Updated poll numbers for the new President. They're just now starting to populate.
02 Aug 2024
Changed name of Slack Bot to "Election Bot" to reflect potato dropping.
23 July 2024
Updated `General Election` to Harris v. Trump. Biden v. Trump was poll #7383