- 02Aug2024
- Changed name of Slack Bot to "Election Bot" to reflect potato dropping.
- 23July2024
- Updated `General Election` to Harris v. Trump (poll #7386). Biden v. Trump was poll #7383.
May 2, 2024 at 12:10
It turns out that one can pull data from RealClearPolitics without having one's IP banned for 24 hours! Now I can knock off my final intermediate scraper training.
The following script will retrieve the RCP average percentage for Trump versus Biden (i.e., Pumpkin versus Potatus), as well as how our veggie in chief is doing. Despite attempting to pull it using a post request with all the necessary headers, it consistently failed. I suspect they’re blocking most methods. Eventually, I resorted to our reliable Playwright framework to accomplish the task.
The latest update (included below) adds a comparison to the previous values by writing them to an external JSON file.
election_2024.py
from playwright.sync_api import sync_playwright
import json
from slack_lib import send_message_to_slack
import time
import random
import os
# Poll IDs tracked by the bot, each mapped to the title used in the Slack message.
poll_titles = {
    7386: "General Election",
    7320: "Job Approval",
    7321: "Economy",
    7359: "Immigration",
    7832: "Inflation",
    7833: "Crime",
}

# JSON file holding the values from the previous run, used for comparison.
PREVIOUS_RESULTS_FILE = "previous_results.json"
def load_previous_results(path=None):
    """Load the poll values saved by the previous run.

    Args:
        path: JSON file to read. Defaults to ``PREVIOUS_RESULTS_FILE``.

    Returns:
        The decoded JSON object, or an empty dict when the file does not
        exist, cannot be decoded as JSON, or does not hold a JSON object.
    """
    if path is None:
        path = PREVIOUS_RESULTS_FILE
    try:
        with open(path, 'r', encoding='utf-8') as file:
            loaded = json.load(file)
    except FileNotFoundError:
        # Nothing has been saved yet (e.g. the very first run).
        return {}
    except ValueError as e:
        # A truncated or hand-edited file should not stop the bot;
        # treat it the same as having no history.
        print(f"Ignoring unreadable results file {path}: {e}")
        return {}
    # Callers index the result by poll ID, so anything but an object is unusable.
    return loaded if isinstance(loaded, dict) else {}
def save_current_results(current_results, path=None):
    """Persist poll values so that the next run can compare against them.

    The data is written to a sibling ``<path>.tmp`` file and then moved into
    place, so an interruption or a serialisation error cannot leave a
    truncated results file behind.

    Args:
        current_results: JSON-serialisable mapping of poll ID to candidate list.
        path: Destination file. Defaults to ``PREVIOUS_RESULTS_FILE``.

    Raises:
        TypeError: If ``current_results`` cannot be serialised as JSON; the
            existing destination file is left untouched in that case.
    """
    if path is None:
        path = PREVIOUS_RESULTS_FILE
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(current_results, file)
        os.replace(tmp_path, path)
    finally:
        # Only present if the write or the move failed; do not leave it around.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def get_change_symbol(current, previous):
    """Return the emoji that marks how a value moved since the previous run.

    Args:
        current: Latest value, as a number or a numeric string.
        previous: Value from the previous run, or ``None`` when there is none.

    Returns:
        "👍" when the value rose, "🔻" when it fell, and "🔹" when it is
        unchanged, when there is no previous value, or when either value
        cannot be read as a number.
    """
    if previous is None:
        return "🔹"
    try:
        now, before = float(current), float(previous)
    except (TypeError, ValueError):
        # One odd value (e.g. an empty string) should not abort the whole message.
        return "🔹"
    if now > before:
        return "👍"
    if now < before:
        return "🔻"
    return "🔹"
def get_differences(current_results, previous_results):
    """Collect, per poll, the candidates whose value differs from the saved one.

    Args:
        current_results: Mapping of poll ID to a list of
            ``{'name': ..., 'value': ...}`` dicts from this run.
        previous_results: Saved mapping from the previous run, keyed by the
            string form of the poll ID.

    Returns:
        Mapping of poll ID to the list of candidate dicts whose value changed.
        Candidates with no saved value, and polls with no changed candidate,
        are left out.
    """
    changed_by_poll = {}
    for poll_id, candidates in current_results.items():
        changed = []
        for entry in candidates:
            saved_key = str(poll_id)
            if saved_key not in previous_results:
                continue
            # First saved entry with the same name wins.
            earlier = next(
                (
                    old['value']
                    for old in previous_results[saved_key]
                    if old['name'] == entry['name']
                ),
                None,
            )
            if earlier is not None and entry['value'] != earlier:
                changed.append(entry)
        if changed:
            changed_by_poll[poll_id] = changed
    return changed_by_poll
def format_results(differences, previous_results):
    """Render the changed poll values as the text of a Slack message.

    Args:
        differences: Mapping of poll ID to the candidate dicts to report.
        previous_results: Saved mapping from the previous run, keyed by the
            string form of the poll ID; used to pick each change symbol.

    Returns:
        One quoted line per poll: the bold poll title followed by each
        candidate's name, value and change symbol, separated by tabs.
        An empty string when ``differences`` is empty.
    """
    rendered_polls = []
    for poll_id, candidates in differences.items():
        title = poll_titles.get(poll_id, f"Poll ID {poll_id}")
        pieces = [f"> *{title}*: \t"]
        for entry in candidates:
            saved_key = str(poll_id)
            earlier = None
            if saved_key in previous_results:
                # First saved entry with the same name wins.
                earlier = next(
                    (
                        old['value']
                        for old in previous_results[saved_key]
                        if old['name'] == entry['name']
                    ),
                    None,
                )
            marker = get_change_symbol(entry['value'], earlier)
            pieces.append(f"{entry['name']} {entry['value']}% {marker} \t")
        pieces.append("\n")
        rendered_polls.append("".join(pieces))
    return "".join(rendered_polls)
def fetch_poll_data(playwright, poll_id):
    """Fetch the current candidate values for one RealClearPolitics poll.

    A headless Chromium instance is launched for the request and is always
    closed again, even when navigation or parsing fails.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
        poll_id: Poll ID inserted into the polling-data URL.

    Returns:
        A ``(poll_id, candidate_results)`` tuple. ``candidate_results`` is a
        list of ``{'name': ..., 'value': ...}`` dicts built from the first
        entry of the response's ``poll`` array. It is empty when the request
        fails, the status is not 200, or the payload cannot be parsed.
    """
    url = f"https://www.realclearpolitics.com/poll/race/{poll_id}/polling_data.json"
    candidate_results = []
    browser = playwright.chromium.launch(
        headless=True,
        args=[
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-infobars',
            '--window-size=1920,1080',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        ]
    )
    try:
        page = browser.new_page()
        page.set_viewport_size({"width": 1920, "height": 1080})
        try:
            response = page.goto(url)
        except Exception as e:
            # Timeouts and network errors should skip this poll, not end the run.
            print(f"Failed to fetch data for poll_id {poll_id}: {e}")
            return poll_id, candidate_results
        if response is None:
            # goto() is documented as able to return None for some navigations.
            print(f"Failed to fetch data for poll_id {poll_id}: no response")
            return poll_id, candidate_results
        if response.status == 200:
            try:
                json_data = response.json()
                # Built in one expression so a failure midway never leaves
                # a partial candidate list behind.
                candidate_results = [
                    {'name': candidate_data['name'], 'value': candidate_data['value']}
                    for candidate_data in json_data['poll'][0]['candidate']
                ]
            except Exception as e:
                print(f"Error parsing JSON data for poll_id {poll_id}: {e}")
        else:
            print(f"Failed to fetch data for poll_id {poll_id}: HTTP {response.status}")
            print(response.text())
    finally:
        browser.close()
    return poll_id, candidate_results
def main():
    """Fetch every tracked poll, report changes to Slack, and save the values.

    Each poll in ``poll_titles`` is fetched after a random 5-14 second pause.
    Values that differ from the saved ones are posted to Slack. The saved
    file is then updated with this run's values; polls that returned nothing
    this run keep their previously saved values so that their next change is
    still detected.
    """
    all_poll_results = {}
    previous_results = load_previous_results()
    with sync_playwright() as playwright:
        for poll_id in poll_titles:
            # Random pause between requests, presumably to avoid being flagged.
            time.sleep(random.randint(5, 14))
            _, results = fetch_poll_data(playwright, poll_id)
            if results:
                all_poll_results[poll_id] = results
    differences = get_differences(all_poll_results, previous_results)
    if differences:
        formatted_text = format_results(differences, previous_results)
        send_message_to_slack(formatted_text, username="👍 Election Bot 👎")
    # Saved keys are strings (JSON object keys), so merge on the string form.
    # Starting from the previous values carries forward any poll that failed
    # to fetch this run instead of erasing its history.
    merged_results = dict(previous_results)
    merged_results.update(
        (str(poll_id), results) for poll_id, results in all_poll_results.items()
    )
    save_current_results(merged_results)


if __name__ == "__main__":
    main()
Below is a script to find other polls of interest. Just toss in numbers in the `poll_titles` variable and let the script tell you what’s what. Note that `7383` is the 2024 General Election (Biden v. Trump) and `7320` is President Potato’s approval rating. I put a few seconds of sleep in between requests to keep from being flagged.
scrapey_poll_list.py
import requests
import time
def fetch_poll_titles(start, end, delay=2, timeout=30):
    """Look up the title of every poll ID in an inclusive range.

    Args:
        start: First poll ID to query.
        end: Last poll ID to query (inclusive).
        delay: Seconds to sleep before each request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        Dict mapping each poll ID to its ``moduleInfo.title``, or to
        ``'No title found'`` when the payload has no title, ``"Invalid JSON"``
        when the body is not JSON, or ``"Error: ..."`` when the request fails.
    """
    base_url = "https://www.realclearpolitics.com/poll/race/{}/polling_data.json"
    titles = {}
    for poll_id in range(start, end + 1):
        url = base_url.format(poll_id)
        time.sleep(delay)
        try:
            # Without a timeout a stalled connection would hang the scan.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Check for HTTP errors
        except requests.exceptions.RequestException as e:
            titles[poll_id] = f"Error: {e}"
            continue
        # Decoded separately: on current Requests a JSON decode failure is
        # also a RequestException and would otherwise be reported as "Error".
        try:
            data = response.json()
        except ValueError:
            titles[poll_id] = "Invalid JSON"
            continue
        module_info = data.get('moduleInfo') if isinstance(data, dict) else None
        if not isinstance(module_info, dict):
            module_info = {}
        titles[poll_id] = module_info.get('title', 'No title found')
    return titles
# Run the scan only when executed as a script, so importing this module
# does not start issuing requests.
if __name__ == "__main__":
    # Fetch titles for polls 7250 through 7400
    poll_titles = fetch_poll_titles(7250, 7400)
    for poll_id, title in poll_titles.items():
        print(f"Poll ID: {poll_id} - Title: {title}")