better htmlentity handling, research prompted by @harrisj's update

This commit is contained in:
Tom Meagher
2013-11-11 14:11:20 -05:00
parent 912b3634d0
commit 3f4d485663

View File

@@ -3,6 +3,7 @@ import re
import sys import sys
import twitter import twitter
import markov import markov
from htmlentitydefs import name2codepoint as n2c
from local_settings import * from local_settings import *
def connect(): def connect():
@@ -12,15 +13,39 @@ def connect():
access_token_secret=MY_ACCESS_TOKEN_SECRET) access_token_secret=MY_ACCESS_TOKEN_SECRET)
return api return api
def entity(text):
    """Convert one HTML entity such as ``&amp;`` or ``&#233;`` to its character.

    ``text`` is the complete entity, including the leading ``&`` and the
    trailing ``;``. Decimal (``&#NNN;``) and hexadecimal (``&#xHHH;``)
    character references are decoded numerically; anything else is looked
    up by name in ``n2c``.

    Returns the decoded one-character unicode string, or ``text`` unchanged
    when the reference is malformed or the entity name is unknown.
    """
    if text[:2] == "&#":
        try:
            # HTML accepts both "&#x" and "&#X" as the hexadecimal prefix.
            if text[:3].lower() == "&#x":
                return unichr(int(text[3:-1], 16))
            return unichr(int(text[2:-1]))
        except (ValueError, OverflowError):
            # Non-numeric digits or an out-of-range code point: keep the
            # entity as written rather than failing on the whole text.
            return text
    try:
        # The dictionary lookup is the call that raises KeyError for an
        # unknown name, so it must live inside the try block.
        return unichr(n2c[text[1:-1]])
    except KeyError:
        return text
def filter_tweet(tweet):
    """Strip retweet tails, links, mentions and markup noise from a tweet.

    Reads ``tweet.text``, applies the clean-up substitutions below in order,
    stores the result back on ``tweet.text`` and returns it.
    """
    tweet.text = re.sub(r'\b(RT|MT) .+','',tweet.text) #take out anything after RT or MT
    tweet.text = re.sub(r'(\#|@|(h\/t)|(http))\S+','',tweet.text) #Take out URLs, hashtags, hts, etc.
    tweet.text = re.sub(r'\n','', tweet.text) #take out new lines.
    tweet.text = re.sub(r'\"|\(|\)', '', tweet.text) #take out quotes.
    # Decode named HTML entities in one left-to-right pass. A callback is
    # used so each match is replaced exactly once and the replacement is
    # inserted literally; text produced by one replacement is never
    # re-scanned and decoded a second time.
    tweet.text = re.sub(r'&\w+;', lambda match: entity(match.group(0)), tweet.text)
    tweet.text = re.sub(r'\xe9', 'e', tweet.text) #take out accented e
    return tweet.text
def grab_tweets(api, max_id=None): def grab_tweets(api, max_id=None):
source_tweets=[] source_tweets=[]
user_tweets = api.GetUserTimeline(screen_name=user, count=200, max_id=max_id, include_rts=True, trim_user=True, exclude_replies=True) user_tweets = api.GetUserTimeline(screen_name=user, count=200, max_id=max_id, include_rts=True, trim_user=True, exclude_replies=True)
@@ -101,7 +126,7 @@ if __name__=="__main__":
if DEBUG == False: if DEBUG == False:
status = api.PostUpdate(ebook_tweet) status = api.PostUpdate(ebook_tweet)
print status.text print status.text.encode('utf-8')
else: else:
print ebook_tweet print ebook_tweet