August 4, 2024 at 15:12
This one isn’t quite as polished as the others yet, so it’s still rough around the edges. I’m not completely sure about the Twitter API details, but that’s not a big concern, since scripts like this are just for experimenting.
This script was cobbled together from a bunch of other scripts that each do their own thing. This one, though, takes anything you can throw at it in RSS form, and sends it over to “X” as a Xweet. You can modify this to fit your needs, but it should be a decent framework to start out.
Feel free to follow @cmcwain to see the script at work.
tweet_rss.py
import feedparser
import tweepy
import os
import hashlib
from datetime import datetime
import urllib.parse
from bs4 import BeautifulSoup
# --- Twitter/X credentials ---------------------------------------------------
# Replace the placeholders with your app's keys from the developer portal.
api_key = "API_KEY"
api_secret_key = "API_SECRET_KEY"
# BUG FIX: these two were referenced below but never defined, which raised a
# NameError before the script could do anything.
access_token = "ACCESS_TOKEN"
access_token_secret = "ACCESS_TOKEN_SECRET"

# v2 client used for posting. create_tweet requires OAuth 1.0a *user* context,
# so all four credentials must be supplied (the original passed only two).
client = tweepy.Client(
    consumer_key=api_key,
    consumer_secret=api_secret_key,
    access_token=access_token,
    access_token_secret=access_token_secret,
)

# File that records the IDs of entries already tweeted, one hash per line.
posted_entries_file = "./posted_news_entries.txt"

# v1.1 API handle; unused by the posting path below but kept for
# compatibility (e.g. media uploads still go through v1.1).
auth = tweepy.OAuth1UserHandler(api_key, api_secret_key, access_token, access_token_secret)
api = tweepy.API(auth)

# Load previously posted entry IDs so reruns don't duplicate tweets.
if os.path.exists(posted_entries_file):
    with open(posted_entries_file, "r") as file:
        posted_entries = set(line.strip() for line in file)
else:
    posted_entries = set()
def generate_entry_id(entry):
    """Build a stable MD5 fingerprint for a feed entry.

    Combines the link, the title, and whichever timestamp field is present
    ('updated', then 'published', then 'pubdate') so that an edited entry
    gets a fresh ID.  MD5 is used only as a fingerprint here, not for
    security, and must stay MD5 so existing IDs on disk remain valid.
    """
    timestamp = (
        entry.get('updated', '')
        or entry.get('published', '')
        or entry.get('pubdate', '')
    )
    fingerprint = f"{entry.link}{entry.title}{timestamp}"
    return hashlib.md5(fingerprint.encode()).hexdigest()
# strptime format expected for each supported feed date field.
_DATE_FORMATS = {
    'updated': '%Y-%m-%dT%H:%M:%S%z',          # Atom-style ISO-8601 with offset
    'published': '%a, %d %b %Y %H:%M:%S %Z',   # RSS RFC-822 style
    'pubdate': '%a, %d %b %Y %H:%M:%S %Z',
}


def is_entry_from_today(entry, date_field):
    """Return True if the entry's `date_field` timestamp falls on today's date.

    The original had three copy-pasted branches differing only in field name
    and format string; they are collapsed into the `_DATE_FORMATS` table.

    Args:
        entry: A feedparser entry (dict-like with attribute access).
        date_field: Which field to check: 'updated', 'published', or 'pubdate'.

    Returns:
        True when the field exists, parses, and matches today's local date;
        False for unknown fields, missing fields, or parse failures (parse
        failures are logged, matching the original behavior).
    """
    fmt = _DATE_FORMATS.get(date_field)
    if fmt is None or date_field not in entry:
        return False
    try:
        published_date = datetime.strptime(entry[date_field], fmt).date()
    except Exception as e:
        # Feeds in the wild deviate from their declared formats; log and skip.
        print(f"Error parsing date for entry {entry.title}: {e}")
        return False
    return published_date == datetime.today().date()
def truncate_text(text, max_length=150):
    """Clip `text` to at most `max_length` characters, appending '...' if clipped."""
    if len(text) <= max_length:
        return text
    return text[:max_length] + "..."
def modify_link(link):
    """Normalize `link` for tweeting: drop query/params/fragment and any
    trailing slash, keeping only scheme, host, and path."""
    parts = urllib.parse.urlparse(link)
    cleaned_path = '/' + '/'.join(parts.path.strip('/').split('/'))
    return urllib.parse.urlunparse(
        (parts.scheme, parts.netloc, cleaned_path, '', '', '')
    )
def strip_html(content):
    """Return the plain text of `content` with all HTML tags removed."""
    return BeautifulSoup(content, 'html.parser').get_text()
def process_feed(rss_url, date_field):
    """Fetch one RSS/Atom feed and tweet every entry from today that has
    not been posted before.

    Side effects: posts via the module-level `client`, adds IDs to the
    module-level `posted_entries` set, and appends them to
    `posted_entries_file` so reruns skip them.

    Args:
        rss_url: URL of the feed to fetch.
        date_field: Which entry field carries the timestamp
            ('updated', 'published', or 'pubdate').
    """
    print(f"Processing feed: {rss_url}")
    parsed = feedparser.parse(rss_url)
    if not parsed.entries:
        print("No entries found.")
        return

    newly_posted = []
    for entry in parsed.entries:
        print(f"Checking entry: {entry.title}")
        if not is_entry_from_today(entry, date_field):
            print(f"Entry not from today: {entry.title}")
            continue
        print(f"Entry from today found: {entry.title}")
        entry_id = generate_entry_id(entry)
        if entry_id in posted_entries:
            print(f"Entry already posted: {entry.title}")
            continue
        # Prefer full content; fall back to summary, then description.
        raw = getattr(entry, 'content', None)
        if raw:
            raw = raw[0].value
        else:
            raw = getattr(entry, 'summary', '') or getattr(entry, 'description', '')
        body = truncate_text(strip_html(raw))
        link = modify_link(entry.link)
        tweet = f"{entry.title}\n\n{body} {link}"
        print(tweet)
        client.create_tweet(text=tweet)
        posted_entries.add(entry_id)
        newly_posted.append(entry_id)

    # Persist only after the loop so a single run appends in one pass.
    if newly_posted:
        with open(posted_entries_file, "a") as file:
            for entry_id in newly_posted:
                file.write(f"{entry_id}\n")
# (feed URL, name of the date field that feed populates) pairs to process.
rss_feeds = [
    ("https://example.com/index.xml", 'updated'),
    ("https://notactuallychicken.com/products/feed.xml", 'pubdate')
]

# Guard the network/posting loop so importing this module (e.g. to reuse the
# helpers) doesn't immediately fetch feeds and post tweets.
if __name__ == "__main__":
    for rss_url, date_field in rss_feeds:
        process_feed(rss_url, date_field)