Use Unicode-compatible regular expressions.

Even though English was enough for the Lord to write the Bible, it's still a smart idea to allow for Unicode characters, if only to allow horses to coöperate in a way the New Yorker would approve of.

See:
http://www.ruby-doc.org/core-1.9.3/Regexp.html#label-Character+Properties 
and 
http://www.newyorker.com/online/blogs/culture/2012/04/the-curse-of-the-diaeresis.html
This commit is contained in:
Sebastian Delmont
2013-08-10 07:45:59 -04:00
parent 3e8bdfba47
commit 9620e24416
2 changed files with 8 additions and 8 deletions

View File

@@ -60,7 +60,7 @@ else
source_tweets.each do |twt| source_tweets.each do |twt|
text = twt text = twt
if text !~ /[\.\"\'\?\!]/ if text !~ /\p{Punct}/
text += "." text += "."
end end
@@ -72,9 +72,9 @@ else
10.times do 10.times do
tweet = markov.generate_sentence tweet = markov.generate_sentence
if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\s\w+$/ if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/
puts "Losing last word randomly" puts "Losing last word randomly"
tweet.gsub(/\s\w+.$/, '') # randomly losing the last word sometimes like horse_ebooks tweet.gsub(/\p{Space}\p{Word}+.$/, '') # randomly losing the last word sometimes like horse_ebooks
end end
if tweet.length < 40 && rand(5) == 0 if tweet.length < 40 && rand(5) == 0
@@ -86,8 +86,8 @@ else
puts "MARKOV: #{tweet}" puts "MARKOV: #{tweet}"
end end
tweet_letters = tweet.gsub(/\W/, '') tweet_letters = tweet.gsub(/\P{Word}/, '')
break if !tweet.nil? && tweet.length < 110 && !source_tweets.any? {|t| t.gsub(/\W/, '') =~ /#{tweet_letters}/ } break if !tweet.nil? && tweet.length < 110 && !source_tweets.any? {|t| t.gsub(/\P{Word}/, '') =~ /#{tweet_letters}/ }
end end
if params["tweet"] if params["tweet"]

View File

@@ -10,9 +10,9 @@ class MarkovChainer
def add_text(text) def add_text(text)
# make sure each paragraph ends with some sentence terminator # make sure each paragraph ends with some sentence terminator
text.gsub!(/\n\s*\n/m, ".") text.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".")
text << "." text << "."
seps = /([.!?;])/ seps = /(\p{Punct})/
sentence = "" sentence = ""
text.split(seps).each { |p| text.split(seps).each { |p|
if seps =~ p if seps =~ p
@@ -37,7 +37,7 @@ class MarkovChainer
private private
def add_sentence(str, terminator) def add_sentence(str, terminator)
words = str.scan(/[\w'\-]+/) words = str.scan(/[\p{Word}'\-]+/)
return unless words.size > order # ignore short sentences return unless words.size > order # ignore short sentences
words << terminator words << terminator
buf = [] buf = []