mirror of
https://github.com/thewesker/iron_ebooks.git
synced 2025-12-20 04:11:12 -05:00
145 lines
3.8 KiB
Ruby
145 lines
3.8 KiB
Ruby
# encoding: UTF-8
|
|
|
|
require 'rubygems'
|
|
require 'twitter'
|
|
require 'punkt-segmenter'
|
|
require 'twitter_init'
|
|
require 'markov'
|
|
require 'htmlentities'
|
|
|
|
# Accumulates the cleaned text of every fetched tweet.
source_tweets = []

# Apply defaults only when the globals have not been set already.
$markov_index ||= 2
$rand_limit ||= 10

puts "PARAMS: #{params}" if params.any?

# Tweeting is on unless the caller passed an explicit "tweet" key.
params["tweet"] = true unless params.key?("tweet")

# Random draw in 0...$rand_limit; the main run below only proceeds on 0
# (or when params["force"] is set).
rand_key = rand($rand_limit)
|
|
|
|
# Characters that may terminate a generated sentence. Frozen so the shared
# constant cannot be mutated by accident.
CLOSING_PUNCTUATION = ['.', ';', ':', '?', '!'].freeze

# Returns one element of CLOSING_PUNCTUATION chosen at random.
def random_closing_punctuation
  CLOSING_PUNCTUATION.sample
end
|
|
|
|
# Turns a batch of tweets into cleaned-up text for the Markov corpus.
#
# Each tweet's text is HTML-entity decoded and everything from a standalone
# "RT " / "MT " marker onwards is removed. Tweets containing an http(s) URL
# are dropped unless $include_urls or params["include_urls"] is truthy;
# tweets starting with "@" are dropped unless $include_replies or
# params["include_replies"] is truthy. In the remaining texts, tokens
# starting with "#", "h/t" or "http" and any leading @mentions are removed,
# surrounding whitespace is stripped, and a "." is appended when the text
# does not already end in one of . ? ; : !
#
# tweets - an Enumerable of objects responding to #text.
#
# Returns an Array of Strings (a text may be empty if nothing survived).
def filtered_tweets(tweets)
  html_decoder = HTMLEntities.new
  include_urls = $include_urls || params["include_urls"]
  include_replies = $include_replies || params["include_replies"]

  source_tweets = tweets.map { |t| html_decoder.decode(t.text).gsub(/\b(RT|MT) .+/, '') }

  source_tweets = source_tweets.reject { |t| t =~ /(https?:\/\/)/ } unless include_urls
  source_tweets = source_tweets.reject { |t| t =~ /^@/ } unless include_replies

  source_tweets.each do |t|
    t.gsub!(/(\#|(h\/t)|(http))\S+/, '')
    t.gsub!(/^(@[\d\w_]+\s?)+/, '')
    # Removing tokens can leave stray whitespace that would hide the final
    # punctuation from the check below.
    t.strip!
    # Leave blank texts blank so callers can still skip them.
    next if t.empty?
    # Append in place: `t += "."` would only rebind the block variable and
    # leave the string stored in the array unchanged.
    t << "." if t !~ /[.?;:!]$/
  end

  source_tweets
end
|
|
|
|
# Main run. Proceeds only about 1 in $rand_limit times (when rand_key is 0),
# or whenever params["force"] is set.
if rand_key != 0 && !params["force"]
  puts "Not running this time (key: #{rand_key})"
else
  # Fetch the source account's timeline, 200 tweets per request (one initial
  # request plus up to 17 follow-ups paging backwards with :max_id).
  begin
    user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :include_rts => false)
    max_id = user_tweets.last.id
    source_tweets += filtered_tweets(user_tweets)

    # Twitter only returns up to 3200 of a user timeline, includes retweets.
    17.times do
      user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :include_rts => false)
      puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
      break if user_tweets.last.nil?
      max_id = user_tweets.last.id
      source_tweets += filtered_tweets(user_tweets)
    end
  rescue => ex
    # Best effort: keep whatever was fetched before the failure.
    puts ex.message
  end

  puts "#{source_tweets.length} tweets found"

  if source_tweets.length == 0
    raise "Error fetching tweets from Twitter. Aborting."
  end

  markov = MarkovChainer.new($markov_index)

  tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences

  source_tweets.each do |twt|
    next if twt.nil? || twt == ''
    sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text)

    # sentences = text.split(/[.:;?!]/)

    # sentences.each do |sentence|
    #   next if sentence =~ /@/

    #   if sentence !~ /\p{Punct}$/
    #     sentence += "."
    #   end

    sentences.each do |sentence|
      next if sentence =~ /@/
      markov.add_sentence(sentence)
    end
  end

  # Source texts reduced to word characters only, computed once instead of
  # on every attempt below.
  source_letters = source_tweets.map { |t| t.gsub(/\P{Word}/, '') }

  # True when the word characters of `text` occur verbatim inside any source
  # tweet, i.e. the generated text merely repeats the original author.
  copied_from_source = lambda do |text|
    letters = text.gsub(/\P{Word}/, '')
    source_letters.any? { |s| s.include?(letters) }
  end

  tweet = nil

  10.times do
    tweet = markov.generate_sentence

    if copied_from_source.call(tweet)
      # Drop the rejected candidate so it cannot be posted if this was the
      # last attempt.
      tweet = nil
      next
    end

    # if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/
    #   puts "Losing last word randomly"
    #   tweet.gsub(/\p{Space}\p{Word}+.$/, '') # randomly losing the last word sometimes like horse_ebooks
    # end

    if tweet.length < 40 && rand(10) == 0
      puts "Short tweet. Adding another sentence randomly"
      next_sentence = markov.generate_sentence
      next if copied_from_source.call(next_sentence)

      tweet += random_closing_punctuation if tweet !~ /[.;:?!]$/
      # Append the sentence that was just checked, not a fresh unchecked one.
      tweet += " #{next_sentence}"
    end

    puts "MARKOV: #{tweet}" unless params["tweet"]

    # NOTE(review): if every attempt is 110+ characters, the last candidate
    # is still used after the loop, as in the original.
    break if tweet.length < 110
  end

  if params["tweet"]
    raise "ERROR: EMPTY TWEET" if tweet.nil? || tweet == ''

    puts "TWEET: #{tweet}"
    Twitter.update(tweet)
  else
    puts "DEBUG: #{tweet}"
  end
end
|
|
|