From b190084a025f0e4a15a56e2e294c697ad0c27c6e Mon Sep 17 00:00:00 2001 From: Jacob Harris Date: Sat, 17 Aug 2013 21:38:14 -0400 Subject: [PATCH] Tweaking sentence tokenization, support for contractions --- Gemfile | 1 + Gemfile.lock | 44 ++++++++++++++++++++++++++++++++++++++++++ ebook.rb | 34 +++++++++++++++++++++------------ ebook.worker | 1 + markov.rb | 54 +++++++++++++++++++++++++++++++++++++--------------- 5 files changed, 107 insertions(+), 27 deletions(-) create mode 100644 Gemfile.lock diff --git a/Gemfile b/Gemfile index b0d3d90..8c89053 100644 --- a/Gemfile +++ b/Gemfile @@ -5,3 +5,4 @@ ruby "1.9.3" gem "twitter" gem "typhoeus" gem "iron_worker_ng" +gem 'punkt-segmenter' diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..903a9ee --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,44 @@ +GEM + remote: https://rubygems.org/ + specs: + ethon (0.5.12) + ffi (>= 1.3.0) + mime-types (~> 1.18) + faraday (0.8.7) + multipart-post (~> 1.1) + ffi (1.9.0) + iron_core (0.6.2) + rest (>= 2.2.0) + iron_worker_ng (0.16.4) + bundler (>= 1.2.0) + iron_core (>= 0.5.1) + rubyzip (>= 0.9.9) + mime-types (1.23) + multi_json (1.7.7) + multipart-post (1.2.0) + net-http-persistent (2.8) + punkt-segmenter (0.9.1) + unicode_utils (>= 1.0.0) + rest (2.6.3) + net-http-persistent + rest-client (>= 0.3.0) + rest-client (1.6.7) + mime-types (>= 1.16) + rubyzip (0.9.9) + simple_oauth (0.2.0) + twitter (4.7.0) + faraday (~> 0.8, < 0.10) + multi_json (~> 1.0) + simple_oauth (~> 0.2) + typhoeus (0.6.3) + ethon (~> 0.5.11) + unicode_utils (1.4.0) + +PLATFORMS + ruby + +DEPENDENCIES + iron_worker_ng + punkt-segmenter + twitter + typhoeus diff --git a/ebook.rb b/ebook.rb index 9befa9a..69fbe2d 100644 --- a/ebook.rb +++ b/ebook.rb @@ -2,7 +2,7 @@ require 'rubygems' require 'twitter' - +require 'punkt-segmenter' require 'twitter_init' require 'markov' @@ -27,7 +27,12 @@ def filtered_tweets(tweets) source_tweets = source_tweets.reject {|t| t =~ /(https?:\/\/)/ } end - source_tweets.map {|t| t.gsub(/(\#|@|(h\/t)|(http))\S+/, '') } + source_tweets.each do |t| + t.gsub!(/(\#|(h\/t)|(http))\S+/, '') + t += "." if t !~ /[.?;:!]$/ + end + + source_tweets end # randomly running only about 1 in $rand_limit times @@ -44,6 +49,7 @@ else 17.times do user_tweets = Twitter.user_timeline($source_account, :count => 200, :trim_user => true, :max_id => max_id - 1, :exclude_replies => true, :include_rts => false) puts "MAX_ID #{max_id} TWEETS: #{user_tweets.length}" + break if user_tweets.last.nil? max_id = user_tweets.last.id source_tweets += filtered_tweets(user_tweets) end @@ -57,20 +63,24 @@ else end markov = MarkovChainer.new($markov_index) - - source_tweets.each do |twt| - text = twt - sentences = text.split(/\p{Punct}/) + tokenizer = Punkt::SentenceTokenizer.new(source_tweets.join(" ")) # init with corpus of all sentences + + source_tweets.each do |twt| + sentences = tokenizer.sentences_from_text(twt, :output => :sentences_text) + + # sentences = text.split(/[.:;?!]/) + + # sentences.each do |sentence| + # next if sentence =~ /@/ + + # if sentence !~ /\p{Punct}$/ + # sentence += "." + # end sentences.each do |sentence| next if sentence =~ /@/ - - if sentence !~ /\p{Punct}$/ - sentence += "." - end - - markov.add_text(sentence) + markov.add_sentence(sentence) end end diff --git a/ebook.worker b/ebook.worker index 33ff648..e567c15 100644 --- a/ebook.worker +++ b/ebook.worker @@ -7,3 +7,4 @@ file "twitter_init.rb" file "markov.rb" gem 'twitter' +gem 'punkt-segmenter' diff --git a/markov.rb b/markov.rb index 9b0b47b..7022101 100644 --- a/markov.rb +++ b/markov.rb @@ -1,5 +1,8 @@ #encoding: UTF-8 +CONTRACTION_APOSTROPHE_SUBSTITUTE = 'qqq' +CONTRACTIONS = %w(aren't can't couldn't didn't doesn't don't hadn't hasn't haven't he'd he'll he's I'd I'll I'm I've isn't it's let's mightn't mustn't shan't she'd she'll she's shouldn't that's there's they'd they'll they're they've we'd we're we've weren't what'll what're what's what've where's who'd who'll who're who's who've won't wouldn't you'd you'll you're you've) + class MarkovChainer attr_reader :order def initialize(order) @@ -9,19 +12,22 @@ class MarkovChainer end def add_text(text) - # make sure each paragraph ends with some sentence terminator - text.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".") - text << "." - seps = /(\p{Punct})/ - sentence = "" - text.split(seps).each { |p| - if seps =~ p - add_sentence(sentence, p) - sentence = "" - else - sentence = p - end - } + # remove curly apostrophes + text.gsub!(/’/, "'") + + # make sure each paragraph ends with some sentence terminator + text.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".") + text << "." + seps = /[.:;?!]/ + sentence = "" + text.split(seps).each { |p| + if seps =~ p + add_sentence(sentence, p) + sentence = "" + else + sentence = p + end + } end def generate_sentence @@ -29,14 +35,32 @@ class MarkovChainer return nil if res.nil? loop { unless nw = next_word_for(res[-order, order]) - return res[0..-2].join(" ") + res.last + out = res[0..-2].join(" ") + (res.last || '.') + out.gsub!(CONTRACTION_APOSTROPHE_SUBSTITUTE, "'") + return out end res << nw } end + def add_sentence(sentence) + if sentence =~ /[.:;?!]$/ + add_sentence_internal(sentence[0, sentence.length - 1], sentence[sentence.length - 1, 1]) + else + add_sentence_internal(sentence, '.') + end + end + private - def add_sentence(str, terminator) + def add_sentence_internal(str, terminator) + str.gsub!(/’/, "'") + + CONTRACTIONS.each do |c| + str.gsub!(/#{c}/i) {|m| m.gsub("'", CONTRACTION_APOSTROPHE_SUBSTITUTE)} + end + str.gsub!(/'s/, CONTRACTION_APOSTROPHE_SUBSTITUTE) + + #puts str words = str.scan(/[\p{Word}'’\-]+/) return unless words.size > order # ignore short sentences words << terminator