Use Unicode-compatible regular expressions.

Even though English was enough for the Lord to write the Bible, it's still a smart idea to allow for Unicode characters, if only to allow horses to coöperate in a way the New Yorker would approve. See: http://www.ruby-doc.org/core-1.9.3/Regexp.html#label-Character+Properties and http://www.newyorker.com/online/blogs/culture/2012/04/the-curse-of-the-diaeresis.html
2025-12-20 04:11:12 -05:00 · 2013-08-10 07:45:59 -04:00
parent 3e8bdfba47
commit 9620e24416
2 changed files with 8 additions and 8 deletions
--- a/markov.rb
+++ b/markov.rb
@@ -10,9 +10,9 @@ class MarkovChainer

   def add_text(text)
     # make sure each paragraph ends with some sentence terminator
-     text.gsub!(/\n\s*\n/m, ".")
+     text.gsub!(/[\r\n]+\p{Space}*[\r\n]+/m, ".")
     text << "."
-     seps = /([.!?;])/
+     seps = /(\p{Punct})/
     sentence = ""
     text.split(seps).each { |p|
       if seps =~ p
@@ -37,7 +37,7 @@ class MarkovChainer

 private
   def add_sentence(str, terminator)
-     words = str.scan(/[\w'’\-]+/)
+     words = str.scan(/[\p{Word}'’\-]+/)
     return unless words.size > order # ignore short sentences
     words << terminator
     buf = []