Tweaking sentence tokenization, support for contractions

This commit is contained in:
Jacob Harris
2013-08-17 21:38:14 -04:00
parent 7061990d1e
commit b190084a02
5 changed files with 107 additions and 27 deletions

View File

@@ -5,3 +5,4 @@ ruby "1.9.3"
gem "twitter" gem "twitter"
gem "typhoeus" gem "typhoeus"
gem "iron_worker_ng" gem "iron_worker_ng"
gem 'punkt-segmenter'

44
Gemfile.lock Normal file
View File

@@ -0,0 +1,44 @@
GEM
remote: https://rubygems.org/
specs:
ethon (0.5.12)
ffi (>= 1.3.0)
mime-types (~> 1.18)
faraday (0.8.7)
multipart-post (~> 1.1)
ffi (1.9.0)
iron_core (0.6.2)
rest (>= 2.2.0)
iron_worker_ng (0.16.4)
bundler (>= 1.2.0)
iron_core (>= 0.5.1)
rubyzip (>= 0.9.9)
mime-types (1.23)
multi_json (1.7.7)
multipart-post (1.2.0)
net-http-persistent (2.8)
punkt-segmenter (0.9.1)
unicode_utils (>= 1.0.0)
rest (2.6.3)
net-http-persistent
rest-client (>= 0.3.0)
rest-client (1.6.7)
mime-types (>= 1.16)
rubyzip (0.9.9)
simple_oauth (0.2.0)
twitter (4.7.0)
faraday (~> 0.8, < 0.10)
multi_json (~> 1.0)
simple_oauth (~> 0.2)
typhoeus (0.6.3)
ethon (~> 0.5.11)
unicode_utils (1.4.0)
PLATFORMS
ruby
DEPENDENCIES
iron_worker_ng
punkt-segmenter
twitter
typhoeus

View File

@@ -2,7 +2,7 @@
require 'rubygems' require 'rubygems'
require 'twitter' require 'twitter'
require 'punkt-segmenter'
require 'twitter_init' require 'twitter_init'
require 'markov' require 'markov'
@@ -27,7 +27,12 @@ def filtered_tweets(tweets)
source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ } source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ }
end end
source_tweets.map {|t| t.gsub(/(\#|@|(h\/t)|(http))\S+/, '') } source_tweets.each do |t|
t.gsub!(/(\#|(h\/t)|(http))\S+/, '')
t += "." if t !~ /[.?;:!]$/
end
source_tweets
end end
# randomly running only about 1 in $rand_limit times # randomly running only about 1 in $rand_limit times
@@ -44,6 +49,7 @@ else
17.times do 17.times do
user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => true, :include_rts => false) user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => true, :include_rts => false)
puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}"
break if user_tweets.last.nil?
max_id = user_tweets.last.id max_id = user_tweets.last.id
source_tweets += filtered_tweets(user_tweets) source_tweets += filtered_tweets(user_tweets)
end end
@@ -57,20 +63,24 @@ else
end end
markov = MarkovChainer.new($markov_index) markov = MarkovChainer.new($markov_index)
source_tweets.each do |twt|
text = twt
sentences = text.split(/\p{Punct}/) tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences
source_tweets.each do |twt|
sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text)
# sentences = text.split(/[.:;?!]/)
# sentences.each do |sentence|
# next if sentence =~ /@/
# if sentence !~ /\p{Punct}$/
# sentence += "."
# end
sentences.each do |sentence| sentences.each do |sentence|
next if sentence =~ /@/ next if sentence =~ /@/
markov.add_sentence(sentence)
if sentence !~ /\p{Punct}$/
sentence += "."
end
markov.add_text(sentence)
end end
end end

View File

@@ -7,3 +7,4 @@ file "twitter_init.rb"
file "markov.rb" file "markov.rb"
gem 'twitter' gem 'twitter'
gem 'punkt-segmenter'

View File

@@ -1,5 +1,8 @@
#encoding: UTF-8 #encoding: UTF-8
# Placeholder written in place of the apostrophe inside recognised
# contractions while sentences are being added; generate_sentence swaps it
# back to "'" in the text it returns.
CONTRACTION_APOSTROPHE_SUBSTITUTE = 'qqq'.freeze

# Contractions whose apostrophe is replaced by the placeholder above.
# They are matched case-insensitively where they are used. Frozen so the
# shared list cannot be modified by accident at runtime.
CONTRACTIONS = %w[
  aren't can't couldn't didn't doesn't don't hadn't hasn't haven't
  he'd he'll he's I'd I'll I'm I've isn't it's let's
  mightn't mustn't shan't she'd she'll she's shouldn't
  that's there's they'd they'll they're they've
  we'd we're we've weren't
  what'll what're what's what've where's
  who'd who'll who're who's who've
  won't wouldn't you'd you'll you're you've
].freeze
class MarkovChainer class MarkovChainer
attr_reader :order attr_reader :order
def initialize(order) def initialize(order)
@@ -9,19 +12,22 @@ class MarkovChainer
end end
def add_text(text) def add_text(text)
# make sure each paragraph ends with some sentence terminator # remove curly apostrophes
text.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".") text.gsub!(/’/, "'")
text << "."
seps = /(\p{Punct})/ # make sure each paragraph ends with some sentence terminator
sentence = "" text.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".")
text.split(seps).each { |p| text << "."
if seps =~ p seps = /[.:;?!]/
add_sentence(sentence, p) sentence = ""
sentence = "" text.split(seps).each { |p|
else if seps =~ p
sentence = p add_sentence(sentence, p)
end sentence = ""
} else
sentence = p
end
}
end end
def generate_sentence def generate_sentence
@@ -29,14 +35,32 @@ class MarkovChainer
return nil if res.nil? return nil if res.nil?
loop { loop {
unless nw = next_word_for(res[-order, order]) unless nw = next_word_for(res[-order, order])
return res[0..-2].join(" ") + res.last out = res[0..-2].join(" ") + (res.last || '.')
out.gsub!(CONTRACTION_APOSTROPHE_SUBSTITUTE, "'")
return out
end end
res << nw res << nw
} }
end end
# Public entry point for adding one already-tokenized sentence to the chain.
#
# Splits a trailing terminator (one of . : ; ? !) off the sentence and passes
# the remaining text plus that terminator to add_sentence_internal. A sentence
# without a recognised terminator is passed through whole, with '.' supplied
# as its terminator.
#
# sentence - String holding a single sentence.
#
# Returns whatever add_sentence_internal returns.
def add_sentence(sentence)
  # Anchor with \z rather than $: in Ruby, $ matches at the end of ANY line,
  # so "Hi there!\n" produced "\n" as the terminator and "a!\nb" produced "b".
  # Whitespace after the terminator is tolerated and dropped.
  terminated = /([.:;?!])\s*\z/.match(sentence)
  if terminated
    add_sentence_internal(terminated.pre_match, terminated[1])
  else
    add_sentence_internal(sentence, '.')
  end
end
private private
def add_sentence(str, terminator) def add_sentence_internal(str, terminator)
str.gsub!(/’/, "'")
CONTRACTIONS.each do |c|
str.gsub!(/#{c}/i) {|m| m.gsub("'", CONTRACTION_APOSTROPHE_SUBSTITUTE)}
end
str.gsub!(/'s/, CONTRACTION_APOSTROPHE_SUBSTITUTE)
#puts str
words = str.scan(/[\p{Word}'\-]+/) words = str.scan(/[\p{Word}'\-]+/)
return unless words.size > order # ignore short sentences return unless words.size > order # ignore short sentences
words << terminator words << terminator