Filter out @usernames, #hashtags, and URLs

This commit is contained in:
Jacob Harris
2013-07-17 21:09:10 -04:00
parent 8ee6b1a871
commit 22f56a9ff7

View File

@@ -18,6 +18,17 @@ end
rand_key = rand($rand_limit) rand_key = rand($rand_limit)
# Converts tweet objects into cleaned-up strings for the Markov chain.
#
# Steps, in order:
#   1. Take each tweet's text and cut everything from a standalone
#      "RT " / "MT " marker to the end of the line.
#   2. Unless $include_urls or params["include_urls"] is truthy, drop
#      every tweet that still contains "http://" or "https://".
#   3. Delete tokens that begin with "#", "@", "h/t" or "http". The
#      prefix must be followed by at least one non-whitespace character,
#      so a bare "h/t " is left in place. Tweets kept in step 2 still
#      lose their URL tokens here.
#
# tweets - enumerable of objects responding to #text.
#
# Returns an Array of Strings.
def filtered_tweets(tweets)
  keep_url_tweets = $include_urls || params["include_urls"]

  texts = tweets.map do |tweet|
    tweet.text.gsub(/\b(RT|MT) .+/, '')
  end

  # Keep only the link-free tweets when URL tweets are not wanted.
  texts = texts.select { |text| (text =~ /(https?:\/\/)/).nil? } unless keep_url_tweets

  texts.map { |text| text.gsub(/(\#|@|(h\/t)|(http))\S+/, '') }
end
# randomly running only about 1 in $rand_limit times # randomly running only about 1 in $rand_limit times
unless rand_key == 0 || params["force"] unless rand_key == 0 || params["force"]
puts "Not running this time (key: #{rand_key})" puts "Not running this time (key: #{rand_key})"
@@ -26,31 +37,34 @@ else
begin begin
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :exclude_replies => true, :include_rts => false) user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :exclude_replies => true, :include_rts => false)
max_id = user_tweets.last.id max_id = user_tweets.last.id
source_tweets += user_tweets.reject {|t| t.text =~ /(https?:\/\/)/ }.map {|t| t.text.gsub(/\b(RT|MT) .+/, '') } source_tweets += filtered_tweets(user_tweets)
# Twitter only returns up to 3200 of a user timeline, includes retweets. # Twitter only returns up to 3200 of a user timeline, includes retweets.
17.times do 17.times do
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => true, :include_rts => false) user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => true, :include_rts => false)
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
max_id = user_tweets.last.id max_id = user_tweets.last.id
source_tweets += user_tweets.reject {|t| t.text =~ /(https?:\/\/)/ }.map {|t| t.text.gsub(/\b(RT|MT) .+/, '') } source_tweets += filtered_tweets(user_tweets)
end end
rescue rescue
end end
puts "#{source_tweets.length} tweets found" puts "#{source_tweets.length} tweets found"
if source_tweets.length == 0
raise "Error fetching tweets from Twitter. Aborting."
end
markov = MarkovChainer.new($markov_index) markov = MarkovChainer.new($markov_index)
source_tweets.each do |twt| source_tweets.each do |twt|
text = twt text = twt
text.gsub!(/\#[\w\d]+/, '') # remove hashtags
markov.add_text(text) markov.add_text(text)
end end
tweet = nil tweet = nil
5.times do 10.times do
tweet = markov.generate_sentence tweet = markov.generate_sentence
if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\s\w+$/ if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\s\w+$/