Use UNICODE-compatible regular expressions.

Even though English was enough for the lord to write the Bible, it's still a smart idea to allow for UNICODE characters, if only to allow horses to coöperate in a way the New Yorker would approve of. See: http://www.ruby-doc.org/core-1.9.3/Regexp.html#label-Character+Properties and http://www.newyorker.com/online/blogs/culture/2012/04/the-curse-of-the-diaeresis.html
2026-02-05 03:55:15 -05:00 · 2013-08-10 07:45:59 -04:00
parent 3e8bdfba47
commit 9620e24416
2 changed files with 8 additions and 8 deletions
--- a/ebook.rb
+++ b/ebook.rb
@@ -60,7 +60,7 @@ else
  source_tweets.each do |twt|
    text = twt
-    if text !~ /[\.\"\'\?\!]/
+    if text !~ /\p{Punct}/
      text += "."
    end
@@ -72,9 +72,9 @@ else
  10.times do
    tweet = markov.generate_sentence
-    if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\s\w+$/ 
+    if rand(3) == 0 && tweet =~ /(in|to|from|for|with|by|our|of|your|around|under|beyond)\p{Space}\w+$/ 
      puts "Losing last word randomly"
-      tweet.gsub(/\s\w+.$/, '')   # randomly losing the last word sometimes like horse_ebooks
+      tweet.gsub(/\p{Space}\p{Word}+.$/, '')   # randomly losing the last word sometimes like horse_ebooks
    end
    if tweet.length < 40 && rand(5) == 0
@@ -86,8 +86,8 @@ else
      puts "MARKOV: #{tweet}"
    end
-    tweet_letters = tweet.gsub(/\W/, '')
+    tweet_letters = tweet.gsub(/\P{Word}/, '')
-    break if !tweet.nil? && tweet.length < 110 && !source_tweets.any? {|t| t.gsub(/\W/, '') =~ /#{tweet_letters}/ }
+    break if !tweet.nil? && tweet.length < 110 && !source_tweets.any? {|t| t.gsub(/\P{Word}/, '') =~ /#{tweet_letters}/ }
  end
  if params["tweet"]
--- a/markov.rb
+++ b/markov.rb
@@ -10,9 +10,9 @@ class MarkovChainer
   def add_text(text)
     # make sure each paragraph ends with some sentence terminator
-     text.gsub!(/\n\s*\n/m, ".")
+     text.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".")
     text << "."
-     seps = /([.!?;])/
+     seps = /(\p{Punct})/
     sentence = ""
     text.split(seps).each { |p|
       if seps =~ p
@@ -37,7 +37,7 @@ class MarkovChainer
 private
   def add_sentence(str, terminator)
-     words = str.scan(/[\w'’\-]+/)
+     words = str.scan(/[\p{Word}'’\-]+/)
     return unless words.size > order # ignore short sentences
     words << terminator
     buf = []