mirror of
https://github.com/thewesker/iron_ebooks.git
synced 2025-12-20 04:11:12 -05:00
Decoding HTML entities in tweets
This commit is contained in:
1
Gemfile
1
Gemfile
@@ -6,3 +6,4 @@ gem "twitter"
|
|||||||
gem "typhoeus"
|
gem "typhoeus"
|
||||||
gem "iron_worker_ng"
|
gem "iron_worker_ng"
|
||||||
gem 'punkt-segmenter'
|
gem 'punkt-segmenter'
|
||||||
|
gem 'htmlentities'
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ GEM
|
|||||||
faraday (0.8.7)
|
faraday (0.8.7)
|
||||||
multipart-post (~> 1.1)
|
multipart-post (~> 1.1)
|
||||||
ffi (1.9.0)
|
ffi (1.9.0)
|
||||||
|
htmlentities (4.3.1)
|
||||||
iron_core (0.6.2)
|
iron_core (0.6.2)
|
||||||
rest (>= 2.2.0)
|
rest (>= 2.2.0)
|
||||||
iron_worker_ng (0.16.4)
|
iron_worker_ng (0.16.4)
|
||||||
@@ -38,6 +39,7 @@ PLATFORMS
|
|||||||
ruby
|
ruby
|
||||||
|
|
||||||
DEPENDENCIES
|
DEPENDENCIES
|
||||||
|
htmlentities
|
||||||
iron_worker_ng
|
iron_worker_ng
|
||||||
punkt-segmenter
|
punkt-segmenter
|
||||||
twitter
|
twitter
|
||||||
|
|||||||
11
ebook.rb
11
ebook.rb
@@ -5,6 +5,7 @@ require 'twitter'
|
|||||||
require 'punkt-segmenter'
|
require 'punkt-segmenter'
|
||||||
require 'twitter_init'
|
require 'twitter_init'
|
||||||
require 'markov'
|
require 'markov'
|
||||||
|
require 'htmlentities'
|
||||||
|
|
||||||
source_tweets = []
|
source_tweets = []
|
||||||
|
|
||||||
@@ -26,9 +27,10 @@ def random_closing_punctuation
|
|||||||
end
|
end
|
||||||
|
|
||||||
def filtered_tweets(tweets)
|
def filtered_tweets(tweets)
|
||||||
|
html_decoder = HTMLEntities.new
|
||||||
include_urls = $include_urls || params["include_urls"]
|
include_urls = $include_urls || params["include_urls"]
|
||||||
include_replies = $include_replies || params["include_replies"]
|
include_replies = $include_replies || params["include_replies"]
|
||||||
source_tweets = tweets.map {|t| t.text.gsub(/\b(RT|MT) .+/, '') }
|
source_tweets = tweets.map {|t| html_decoder.decode(t.text).gsub(/\b(RT|MT) .+/, '') }
|
||||||
|
|
||||||
if !include_urls
|
if !include_urls
|
||||||
source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ }
|
source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ }
|
||||||
@@ -53,19 +55,20 @@ unless rand_key == 0 || params["force"]
|
|||||||
else
|
else
|
||||||
# Fetch a thousand tweets
|
# Fetch a thousand tweets
|
||||||
begin
|
begin
|
||||||
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :exclude_replies => false, :include_rts => false)
|
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :include_rts => false)
|
||||||
max_id = user_tweets.last.id
|
max_id = user_tweets.last.id
|
||||||
source_tweets += filtered_tweets(user_tweets)
|
source_tweets += filtered_tweets(user_tweets)
|
||||||
|
|
||||||
# Twitter only returns up to 3200 of a user timeline, includes retweets.
|
# Twitter only returns up to 3200 of a user timeline, includes retweets.
|
||||||
17.times do
|
17.times do
|
||||||
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => false, :include_rts => false)
|
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :include_rts => false)
|
||||||
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
|
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
|
||||||
break if user_tweets.last.nil?
|
break if user_tweets.last.nil?
|
||||||
max_id = user_tweets.last.id
|
max_id = user_tweets.last.id
|
||||||
source_tweets += filtered_tweets(user_tweets)
|
source_tweets += filtered_tweets(user_tweets)
|
||||||
end
|
end
|
||||||
rescue
|
rescue => ex
|
||||||
|
puts ex.message
|
||||||
end
|
end
|
||||||
|
|
||||||
puts "#{source_tweets.length} tweets found"
|
puts "#{source_tweets.length} tweets found"
|
||||||
|
|||||||
@@ -8,3 +8,4 @@ file "markov.rb"
|
|||||||
|
|
||||||
gem 'twitter'
|
gem 'twitter'
|
||||||
gem 'punkt-segmenter'
|
gem 'punkt-segmenter'
|
||||||
|
gem 'htmlentities'
|
||||||
|
|||||||
Reference in New Issue
Block a user