From 3f4d485663519ad60800d770cdca7731d0d3f973 Mon Sep 17 00:00:00 2001
From: Tom Meagher <hello+git@tommeagher.com>
Date: Mon, 11 Nov 2013 14:11:20 -0500
Subject: [PATCH] better htmlentity handling, research prompted by @harrisj's
 update

---
 ebooks.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/ebooks.py b/ebooks.py
index 15ec6b4..01e8619 100644
--- a/ebooks.py
+++ b/ebooks.py
@@ -3,6 +3,7 @@ import re
 import sys
 import twitter
 import markov
+from htmlentitydefs import name2codepoint as n2c
 from local_settings import *
 
 def connect():
@@ -12,14 +13,38 @@ def connect():
                           access_token_secret=MY_ACCESS_TOKEN_SECRET)
     return api
 
+def entity(text):
+    """Decode a single HTML entity string into its unicode character.
+
+    text is the whole entity including '&' and ';', e.g. '&amp;', '&#38;'
+    or '&#x26;'. Unknown or malformed entities are returned unchanged.
+    """
+    try:
+        if text[:3] == "&#x":
+            # Hexadecimal character reference.
+            return unichr(int(text[3:-1], 16))
+        if text[:2] == "&#":
+            # Decimal character reference.
+            return unichr(int(text[2:-1]))
+        # Named entity; an unknown name raises KeyError, handled below.
+        return unichr(n2c[text[1:-1]])
+    except (KeyError, ValueError, OverflowError):
+        return text
+
 def filter_tweet(tweet):
+    """Clean tweet.text in place and return it: decode HTML entities, then
+    strip RT/MT tails, URLs, hashtags, mentions, newlines and quotes."""
+    # Decode entities first, in one pass: the hashtag rule below would eat
+    # the '#NNN;' part of numeric references, and a callable replacement
+    # means already-decoded text is never scanned (or decoded) again.
+    tweet.text = re.sub(r'&#?\w+;', lambda m: entity(m.group(0)), tweet.text)
     tweet.text = re.sub(r'\b(RT|MT) .+','',tweet.text) #take out anything after RT or MT
     tweet.text = re.sub(r'(\#|@|(h\/t)|(http))\S+','',tweet.text) #Take out URLs, hashtags, hts, etc.
     tweet.text = re.sub(r'\n','', tweet.text) #take out new lines.
     tweet.text = re.sub(r'\"|\(|\)', '', tweet.text) #take out quotes.
     tweet.text = re.sub(r'\xe9', 'e', tweet.text) #take out accented e
-    tweet.text = re.sub(r'\&amp;', '&', tweet.text) #clean up escaped html ampersands
+    # '&amp;' needs no rule of its own; the entity pass at the top decodes it.
     return tweet.text
                                                     
 def grab_tweets(api, max_id=None):
     source_tweets=[]
@@ -101,7 +126,7 @@ if __name__=="__main__":
                           
             if DEBUG == False:
                 status = api.PostUpdate(ebook_tweet)
-                print status.text
+                print status.text.encode('utf-8')
             else:
                 print ebook_tweet