mirror of
https://github.com/thewesker/iron_ebooks.git
synced 2025-12-20 04:11:12 -05:00
Tweaking sentence tokenization, support for contractions
This commit is contained in:
1
Gemfile
1
Gemfile
@@ -5,3 +5,4 @@ ruby "1.9.3"
|
|||||||
gem "twitter"
|
gem "twitter"
|
||||||
gem "typhoeus"
|
gem "typhoeus"
|
||||||
gem "iron_worker_ng"
|
gem "iron_worker_ng"
|
||||||
|
gem 'punkt-segmenter'
|
||||||
|
|||||||
44
Gemfile.lock
Normal file
44
Gemfile.lock
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
GEM
|
||||||
|
remote: https://rubygems.org/
|
||||||
|
specs:
|
||||||
|
ethon (0.5.12)
|
||||||
|
ffi (>= 1.3.0)
|
||||||
|
mime-types (~> 1.18)
|
||||||
|
faraday (0.8.7)
|
||||||
|
multipart-post (~> 1.1)
|
||||||
|
ffi (1.9.0)
|
||||||
|
iron_core (0.6.2)
|
||||||
|
rest (>= 2.2.0)
|
||||||
|
iron_worker_ng (0.16.4)
|
||||||
|
bundler (>= 1.2.0)
|
||||||
|
iron_core (>= 0.5.1)
|
||||||
|
rubyzip (>= 0.9.9)
|
||||||
|
mime-types (1.23)
|
||||||
|
multi_json (1.7.7)
|
||||||
|
multipart-post (1.2.0)
|
||||||
|
net-http-persistent (2.8)
|
||||||
|
punkt-segmenter (0.9.1)
|
||||||
|
unicode_utils (>= 1.0.0)
|
||||||
|
rest (2.6.3)
|
||||||
|
net-http-persistent
|
||||||
|
rest-client (>= 0.3.0)
|
||||||
|
rest-client (1.6.7)
|
||||||
|
mime-types (>= 1.16)
|
||||||
|
rubyzip (0.9.9)
|
||||||
|
simple_oauth (0.2.0)
|
||||||
|
twitter (4.7.0)
|
||||||
|
faraday (~> 0.8, < 0.10)
|
||||||
|
multi_json (~> 1.0)
|
||||||
|
simple_oauth (~> 0.2)
|
||||||
|
typhoeus (0.6.3)
|
||||||
|
ethon (~> 0.5.11)
|
||||||
|
unicode_utils (1.4.0)
|
||||||
|
|
||||||
|
PLATFORMS
|
||||||
|
ruby
|
||||||
|
|
||||||
|
DEPENDENCIES
|
||||||
|
iron_worker_ng
|
||||||
|
punkt-segmenter
|
||||||
|
twitter
|
||||||
|
typhoeus
|
||||||
34
ebook.rb
34
ebook.rb
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
require 'rubygems'
|
require 'rubygems'
|
||||||
require 'twitter'
|
require 'twitter'
|
||||||
|
require 'punkt-segmenter'
|
||||||
require 'twitter_init'
|
require 'twitter_init'
|
||||||
require 'markov'
|
require 'markov'
|
||||||
|
|
||||||
@@ -27,7 +27,12 @@ def filtered_tweets(tweets)
|
|||||||
source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ }
|
source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ }
|
||||||
end
|
end
|
||||||
|
|
||||||
source_tweets.map {|t| t.gsub(/(\#|@|(h\/t)|(http))\S+/, '') }
|
source_tweets.each do |t|
|
||||||
|
t.gsub!(/(\#|(h\/t)|(http))\S+/, '')
|
||||||
|
t += "." if t !~ /[.?;:!]$/
|
||||||
|
end
|
||||||
|
|
||||||
|
source_tweets
|
||||||
end
|
end
|
||||||
|
|
||||||
# randomly running only about 1 in $rand_limit times
|
# randomly running only about 1 in $rand_limit times
|
||||||
@@ -44,6 +49,7 @@ else
|
|||||||
17.times do
|
17.times do
|
||||||
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => true, :include_rts => false)
|
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => true, :include_rts => false)
|
||||||
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
|
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
|
||||||
|
break if user_tweets.last.nil?
|
||||||
max_id = user_tweets.last.id
|
max_id = user_tweets.last.id
|
||||||
source_tweets += filtered_tweets(user_tweets)
|
source_tweets += filtered_tweets(user_tweets)
|
||||||
end
|
end
|
||||||
@@ -57,20 +63,24 @@ else
|
|||||||
end
|
end
|
||||||
|
|
||||||
markov = MarkovChainer.new($markov_index)
|
markov = MarkovChainer.new($markov_index)
|
||||||
|
|
||||||
source_tweets.each do |twt|
|
|
||||||
text = twt
|
|
||||||
|
|
||||||
sentences = text.split(/\p{Punct}/)
|
tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences
|
||||||
|
|
||||||
|
source_tweets.each do |twt|
|
||||||
|
sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text)
|
||||||
|
|
||||||
|
# sentences = text.split(/[.:;?!]/)
|
||||||
|
|
||||||
|
# sentences.each do |sentence|
|
||||||
|
# next if sentence =~ /@/
|
||||||
|
|
||||||
|
# if sentence !~ /\p{Punct}$/
|
||||||
|
# sentence += "."
|
||||||
|
# end
|
||||||
|
|
||||||
sentences.each do |sentence|
|
sentences.each do |sentence|
|
||||||
next if sentence =~ /@/
|
next if sentence =~ /@/
|
||||||
|
markov.add_sentence(sentence)
|
||||||
if sentence !~ /\p{Punct}$/
|
|
||||||
sentence += "."
|
|
||||||
end
|
|
||||||
|
|
||||||
markov.add_text(sentence)
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -7,3 +7,4 @@ file "twitter_init.rb"
|
|||||||
file "markov.rb"
|
file "markov.rb"
|
||||||
|
|
||||||
gem 'twitter'
|
gem 'twitter'
|
||||||
|
gem 'punkt-segmenter'
|
||||||
|
|||||||
54
markov.rb
54
markov.rb
@@ -1,5 +1,8 @@
|
|||||||
#encoding: UTF-8
|
#encoding: UTF-8
|
||||||
|
|
||||||
|
CONTRACTION_APOSTROPHE_SUBSTITUTE = 'qqq'
|
||||||
|
CONTRACTIONS = %w(aren't can't couldn't didn't doesn't don't hadn't hasn't haven't he'd he'll he's I'd I'll I'm I've isn't it's let's mightn't mustn't shan't she'd she'll she's shouldn't that's there's they'd they'll they're they've we'd we're we've weren't what'll what're what's what've where's who'd who'll who're who's who've won't wouldn't you'd you'll you're you've)
|
||||||
|
|
||||||
class MarkovChainer
|
class MarkovChainer
|
||||||
attr_reader :order
|
attr_reader :order
|
||||||
def initialize(order)
|
def initialize(order)
|
||||||
@@ -9,19 +12,22 @@ class MarkovChainer
|
|||||||
end
|
end
|
||||||
|
|
||||||
# Splits a block of free text into sentences and feeds each one to the chain.
#
# text - String of one or more sentences/paragraphs. It is not modified.
#
# Sentences are delimited by ".", ":", ";", "?" or "!"; the delimiter that
# closed a sentence is passed along as that sentence's terminator.
def add_text(text)
  # Work on a copy so the caller's string is left untouched (and frozen
  # strings are accepted). Curly apostrophes become straight ones.
  normalized = text.gsub(/’/, "'")

  # make sure each paragraph ends with some sentence terminator
  normalized.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".")
  normalized << "."

  # The capture group is essential: without it String#split discards the
  # separators and no sentence would ever be completed.
  separators = /([.:;?!])/
  pending = ""
  normalized.split(separators).each do |piece|
    if piece =~ /\A[.:;?!]\z/
      # Runs of terminators ("?!", "...") leave nothing pending; skip them.
      add_sentence_internal(pending, piece) unless pending.strip.empty?
      pending = ""
    else
      pending = piece
    end
  end
end
|
||||||
|
|
||||||
def generate_sentence
|
def generate_sentence
|
||||||
@@ -29,14 +35,32 @@ class MarkovChainer
|
|||||||
return nil if res.nil?
|
return nil if res.nil?
|
||||||
loop {
|
loop {
|
||||||
unless nw = next_word_for(res[-order, order])
|
unless nw = next_word_for(res[-order, order])
|
||||||
return res[0..-2].join(" ") + res.last
|
out = res[0..-2].join(" ") + (res.last || '.')
|
||||||
|
out.gsub!(CONTRACTION_APOSTROPHE_SUBSTITUTE, "'")
|
||||||
|
return out
|
||||||
end
|
end
|
||||||
res << nw
|
res << nw
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Adds a single sentence to the chain.
#
# sentence - String holding one sentence. It is not modified.
#
# If the sentence (ignoring trailing whitespace) ends in ".", ":", ";", "?"
# or "!", that character is split off and used as the terminator; otherwise
# the sentence is stored with "." as its terminator.
def add_sentence(sentence)
  # rstrip returns a new string, so the in-place edits made by
  # add_sentence_internal never reach the caller's object.
  body = sentence.rstrip
  terminator = '.'

  # \z anchors at the true end of the string; "$" would also match before an
  # embedded newline and make us cut off the wrong character.
  if body =~ /[.:;?!]\z/
    terminator = body[-1, 1]
    body = body[0...-1]
  end

  add_sentence_internal(body, terminator)
end
|
||||||
|
|
||||||
private
|
private
|
||||||
def add_sentence(str, terminator)
|
def add_sentence_internal(str, terminator)
|
||||||
|
str.gsub!(/’/, "'")
|
||||||
|
|
||||||
|
CONTRACTIONS.each do |c|
|
||||||
|
str.gsub!(/#{c}/i) {|m| m.gsub("'", CONTRACTION_APOSTROPHE_SUBSTITUTE)}
|
||||||
|
end
|
||||||
|
str.gsub!(/'s/, CONTRACTION_APOSTROPHE_SUBSTITUTE)
|
||||||
|
|
||||||
|
#puts str
|
||||||
words = str.scan(/[\p{Word}'’\-]+/)
|
words = str.scan(/[\p{Word}'’\-]+/)
|
||||||
return unless words.size > order # ignore short sentences
|
return unless words.size > order # ignore short sentences
|
||||||
words << terminator
|
words << terminator
|
||||||
|
|||||||
Reference in New Issue
Block a user